mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Add wiki link rewriting to local Kiwix
Rewrites OSM wikipedia/wikidata/wikivoyage/appropedia extratag values to local Kiwix URLs (wiki.echo6.co) when the article exists in a loaded ZIM, falling back silently to public URLs otherwise. - New lib/wiki_rewrite.py: URL classification, Kiwix OPDS catalog discovery (xml.etree.ElementTree), HEAD-based availability check, positive-only SQLite cache, disabled discovery stubs - place_detail.py: _enrich_wiki_links() at both Nominatim and Overpass enrichment sites, before cache_put - Profile flags: has_wiki_rewriting (home/regional: true, minimal: false), has_wiki_discovery (all: false, stubs for future activation) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
9c5b0520f9
commit
829bc87b7b
5 changed files with 385 additions and 0 deletions
|
|
@ -38,10 +38,13 @@ features:
|
|||
has_3d_terrain: false
|
||||
has_traffic_overlay: true
|
||||
has_landclass: true
|
||||
has_public_lands_layer: true
|
||||
has_address_book_write: false
|
||||
has_overture_enrichment: true
|
||||
has_google_places_enrichment: true
|
||||
has_contacts: true
|
||||
has_wiki_rewriting: true
|
||||
has_wiki_discovery: false
|
||||
|
||||
defaults:
|
||||
center: [42.5736, -114.6066]
|
||||
|
|
|
|||
|
|
@ -33,10 +33,13 @@ features:
|
|||
has_3d_terrain: false
|
||||
has_traffic_overlay: false
|
||||
has_landclass: false
|
||||
has_public_lands_layer: false
|
||||
has_address_book_write: true
|
||||
has_overture_enrichment: false
|
||||
has_google_places_enrichment: false
|
||||
has_contacts: false
|
||||
has_wiki_rewriting: false
|
||||
has_wiki_discovery: false
|
||||
|
||||
defaults:
|
||||
center: [44.0, -114.0]
|
||||
|
|
|
|||
|
|
@ -38,10 +38,13 @@ features:
|
|||
has_3d_terrain: false
|
||||
has_traffic_overlay: true
|
||||
has_landclass: true
|
||||
has_public_lands_layer: true
|
||||
has_address_book_write: true
|
||||
has_overture_enrichment: false
|
||||
has_google_places_enrichment: false
|
||||
has_contacts: false
|
||||
has_wiki_rewriting: true
|
||||
has_wiki_discovery: false
|
||||
|
||||
defaults:
|
||||
center: [44.0, -114.0]
|
||||
|
|
|
|||
|
|
@ -272,6 +272,56 @@ def _apply_google_data(result, google_data, gaps):
|
|||
result['extratags'] = extratags
|
||||
|
||||
|
||||
|
||||
|
||||
# ── Wiki link rewriting ─────────────────────────────────────────────────
|
||||
|
||||
# Extratag keys that may contain wiki references
|
||||
_WIKI_TAGS = ('wikipedia', 'wikidata', 'wikivoyage', 'appropedia')
|
||||
|
||||
|
||||
def _enrich_wiki_links(result):
|
||||
"""
|
||||
Rewrite wiki-related extratags to local Kiwix URLs where available.
|
||||
Falls back to public URLs. Only runs when has_wiki_rewriting is enabled.
|
||||
Returns the (possibly enriched) result dict.
|
||||
"""
|
||||
try:
|
||||
from .deployment_config import get_deployment_config
|
||||
deploy_config = get_deployment_config()
|
||||
features = deploy_config.get('features', {})
|
||||
if not features.get('has_wiki_rewriting', False):
|
||||
return result
|
||||
except Exception:
|
||||
return result
|
||||
|
||||
try:
|
||||
from .wiki_rewrite import rewrite_wiki_link
|
||||
except ImportError:
|
||||
logger.debug("wiki_rewrite module not available")
|
||||
return result
|
||||
|
||||
extratags = result.get('extratags', {})
|
||||
if not extratags:
|
||||
return result
|
||||
|
||||
rewrites = {}
|
||||
for tag in _WIKI_TAGS:
|
||||
value = extratags.get(tag)
|
||||
if not value:
|
||||
continue
|
||||
url, status = rewrite_wiki_link(tag, value)
|
||||
if status != 'original':
|
||||
extratags[tag] = url
|
||||
rewrites[tag] = status
|
||||
|
||||
if rewrites:
|
||||
result['extratags'] = extratags
|
||||
result.setdefault('sources', {})['wiki_rewrites'] = rewrites
|
||||
logger.debug(f"Wiki rewrites for {result.get('osm_type')}/{result.get('osm_id')}: {rewrites}")
|
||||
|
||||
return result
|
||||
|
||||
# ── Nominatim parsing ───────────────────────────────────────────────────
|
||||
|
||||
# Nominatim address array uses rank_address to indicate what each entry is.
|
||||
|
|
@ -560,6 +610,7 @@ def get_place_detail(osm_type, osm_id):
|
|||
if nominatim_result:
|
||||
nominatim_result = _enrich_with_overture(nominatim_result, osm_type, osm_id)
|
||||
nominatim_result = _enrich_with_google(nominatim_result, osm_type, osm_id)
|
||||
nominatim_result = _enrich_wiki_links(nominatim_result)
|
||||
cache_put(osm_type, osm_id, nominatim_result, 'nominatim_local')
|
||||
return nominatim_result, 200
|
||||
|
||||
|
|
@ -592,6 +643,7 @@ def get_place_detail(osm_type, osm_id):
|
|||
if overpass_result:
|
||||
overpass_result = _enrich_with_overture(overpass_result, osm_type, osm_id)
|
||||
overpass_result = _enrich_with_google(overpass_result, osm_type, osm_id)
|
||||
overpass_result = _enrich_wiki_links(overpass_result)
|
||||
cache_put(osm_type, osm_id, overpass_result, 'overpass')
|
||||
return overpass_result, 200
|
||||
|
||||
|
|
|
|||
324
lib/wiki_rewrite.py
Normal file
324
lib/wiki_rewrite.py
Normal file
|
|
@ -0,0 +1,324 @@
|
|||
"""
|
||||
Wiki link rewriter — rewrites OSM wikipedia/wikidata/wikivoyage/appropedia
|
||||
links to local Kiwix URLs where the article exists in a loaded ZIM.
|
||||
|
||||
Falls back silently to public URLs when article is unavailable locally.
|
||||
Caches positive results only in place_cache.db.
|
||||
|
||||
Kiwix catalog is parsed from the OPDS Atom feed at startup and refreshed
|
||||
hourly to pick up newly loaded ZIMs without a restart.
|
||||
|
||||
Operations note:
|
||||
- After loading a new ZIM, either restart RECON (forces fresh catalog
|
||||
fetch) or wait up to 1 hour for automatic refresh.
|
||||
- To invalidate the wiki cache (e.g. after ZIM update):
|
||||
sqlite3 /opt/recon/data/place_cache.db "DELETE FROM wiki_cache;"
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
import time
|
||||
import xml.etree.ElementTree as ET
|
||||
from urllib.parse import unquote, quote
|
||||
|
||||
import requests as http_requests
|
||||
|
||||
from .utils import setup_logging
|
||||
|
||||
logger = setup_logging('recon.wiki_rewrite')
|
||||
|
||||
# ── Configuration ───────────────────────────────────────────────────────
|
||||
|
||||
KIWIX_BASE = "http://localhost:8430"
|
||||
KIWIX_PUBLIC_BASE = "https://wiki.echo6.co"
|
||||
KIWIX_CATALOG_URL = f"{KIWIX_BASE}/catalog/v2/entries"
|
||||
HEAD_TIMEOUT = 1.5 # seconds
|
||||
CATALOG_REFRESH_INTERVAL = 3600 # 1 hour
|
||||
|
||||
# OPDS Atom namespace
|
||||
_ATOM_NS = "http://www.w3.org/2005/Atom"
|
||||
|
||||
# ── ZIM catalog map ─────────────────────────────────────────────────────
|
||||
|
||||
_zim_map = {} # source_type → content_path e.g. 'wikipedia' → 'wikipedia_en_all_maxi_2026-02'
|
||||
_zim_map_ts = 0.0 # last refresh timestamp
|
||||
|
||||
# Prefix-to-source-type mapping (order matters: longest prefix first)
|
||||
_ZIM_PREFIX_MAP = [
|
||||
('wikipedia_en_all', 'wikipedia'),
|
||||
('appropedia_en_all', 'appropedia'),
|
||||
('wikivoyage_en', 'wikivoyage'),
|
||||
('wikidata_en', 'wikidata'),
|
||||
]
|
||||
|
||||
|
||||
def _discover_zims():
|
||||
"""Parse Kiwix OPDS Atom catalog to map source types to content paths."""
|
||||
global _zim_map, _zim_map_ts
|
||||
|
||||
try:
|
||||
resp = http_requests.get(KIWIX_CATALOG_URL, timeout=5)
|
||||
if resp.status_code != 200:
|
||||
logger.warning(f"Kiwix catalog returned HTTP {resp.status_code}")
|
||||
return
|
||||
|
||||
root = ET.fromstring(resp.content)
|
||||
new_map = {}
|
||||
|
||||
for entry in root.findall(f"{{{_ATOM_NS}}}entry"):
|
||||
name_el = entry.find(f"{{{_ATOM_NS}}}name")
|
||||
if name_el is None:
|
||||
continue
|
||||
book_name = name_el.text or ""
|
||||
|
||||
# <link type="text/html" href="/content/..."/>
|
||||
content_path = None
|
||||
for link in entry.findall(f"{{{_ATOM_NS}}}link"):
|
||||
if link.get("type") == "text/html":
|
||||
href = link.get("href", "")
|
||||
if href.startswith("/content/"):
|
||||
content_path = href[len("/content/"):]
|
||||
break
|
||||
|
||||
if not content_path:
|
||||
continue
|
||||
|
||||
# Match book name against known prefixes
|
||||
for prefix, source_type in _ZIM_PREFIX_MAP:
|
||||
if book_name.startswith(prefix):
|
||||
new_map[source_type] = content_path
|
||||
break
|
||||
|
||||
_zim_map = new_map
|
||||
_zim_map_ts = time.time()
|
||||
logger.info(f"ZIM catalog refreshed: {new_map}")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to discover ZIMs from Kiwix catalog: {e}")
|
||||
|
||||
|
||||
def _ensure_zim_map():
|
||||
"""Lazy-load and refresh ZIM map if stale."""
|
||||
if not _zim_map or (time.time() - _zim_map_ts) > CATALOG_REFRESH_INTERVAL:
|
||||
_discover_zims()
|
||||
|
||||
|
||||
# ── Database (wiki_cache in place_cache.db) ─────────────────────────────
|
||||
|
||||
_db_conn = None
|
||||
|
||||
|
||||
def _get_db():
|
||||
"""Return a module-level SQLite connection to place_cache.db (lazy init)."""
|
||||
global _db_conn
|
||||
if _db_conn is not None:
|
||||
return _db_conn
|
||||
|
||||
db_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
|
||||
os.makedirs(db_dir, exist_ok=True)
|
||||
db_path = os.path.join(db_dir, 'place_cache.db')
|
||||
|
||||
_db_conn = sqlite3.connect(db_path, check_same_thread=False)
|
||||
_db_conn.execute("PRAGMA journal_mode=WAL")
|
||||
_db_conn.execute("PRAGMA synchronous=NORMAL")
|
||||
_db_conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS wiki_cache (
|
||||
source_type TEXT NOT NULL,
|
||||
article_id TEXT NOT NULL,
|
||||
kiwix_url TEXT NOT NULL,
|
||||
cached_at INTEGER NOT NULL,
|
||||
PRIMARY KEY (source_type, article_id)
|
||||
)
|
||||
""")
|
||||
_db_conn.commit()
|
||||
logger.info(f"Wiki cache table ready in {db_path}")
|
||||
return _db_conn
|
||||
|
||||
|
||||
# ── URL classification ──────────────────────────────────────────────────
|
||||
|
||||
# Patterns for OSM wikipedia/wikidata tag values
|
||||
_WIKI_TAG_RE = re.compile(r'^(?:en:)?(.+)$') # "en:Title" or just "Title"
|
||||
_WIKI_URL_RE = re.compile(r'https?://en\.wikipedia\.org/wiki/(.+)')
|
||||
_WIKIDATA_TAG_RE = re.compile(r'^(Q\d+)$')
|
||||
_WIKIDATA_URL_RE = re.compile(r'https?://(?:www\.)?wikidata\.org/wiki/(Q\d+)')
|
||||
_WIKIVOYAGE_URL_RE = re.compile(r'https?://en\.wikivoyage\.org/wiki/(.+)')
|
||||
_APPROPEDIA_URL_RE = re.compile(r'https?://(?:www\.)?appropedia\.org/(?:wiki/)?(.+)')
|
||||
|
||||
|
||||
def _normalize_article_id(article_id):
|
||||
"""Normalize article ID to MediaWiki/Kiwix convention: spaces → underscores."""
|
||||
return article_id.replace(' ', '_')
|
||||
|
||||
|
||||
def classify_wiki_link(tag_name, value):
|
||||
"""
|
||||
Classify an OSM extratag value into (source_type, article_id) or None.
|
||||
|
||||
tag_name: the extratags key ('wikipedia', 'wikidata', etc.)
|
||||
value: the raw tag value from OSM
|
||||
|
||||
Article IDs are normalized to MediaWiki convention (spaces → underscores).
|
||||
"""
|
||||
if not value or not isinstance(value, str):
|
||||
return None
|
||||
|
||||
value = value.strip()
|
||||
|
||||
if tag_name == 'wikidata':
|
||||
m = _WIKIDATA_TAG_RE.match(value)
|
||||
if m:
|
||||
return ('wikidata', m.group(1))
|
||||
m = _WIKIDATA_URL_RE.match(value)
|
||||
if m:
|
||||
return ('wikidata', m.group(1))
|
||||
return None
|
||||
|
||||
if tag_name == 'wikipedia':
|
||||
# URL form: https://en.wikipedia.org/wiki/Title
|
||||
m = _WIKI_URL_RE.match(value)
|
||||
if m:
|
||||
return ('wikipedia', _normalize_article_id(unquote(m.group(1))))
|
||||
# Tag form: "en:Title" or "Title"
|
||||
m = _WIKI_TAG_RE.match(value)
|
||||
if m:
|
||||
return ('wikipedia', _normalize_article_id(m.group(1)))
|
||||
return None
|
||||
|
||||
if tag_name == 'wikivoyage':
|
||||
m = _WIKIVOYAGE_URL_RE.match(value)
|
||||
if m:
|
||||
return ('wikivoyage', _normalize_article_id(unquote(m.group(1))))
|
||||
# Plain tag: "en:Title" or "Title"
|
||||
m = _WIKI_TAG_RE.match(value)
|
||||
if m:
|
||||
return ('wikivoyage', _normalize_article_id(m.group(1)))
|
||||
return None
|
||||
|
||||
if tag_name == 'appropedia':
|
||||
m = _APPROPEDIA_URL_RE.match(value)
|
||||
if m:
|
||||
return ('appropedia', _normalize_article_id(unquote(m.group(1))))
|
||||
return ('appropedia', _normalize_article_id(value))
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# ── URL builders ────────────────────────────────────────────────────────
|
||||
|
||||
def build_kiwix_url(source_type, article_id):
|
||||
"""Build a public Kiwix URL. Returns None if source_type not in ZIM map."""
|
||||
_ensure_zim_map()
|
||||
content_path = _zim_map.get(source_type)
|
||||
if not content_path:
|
||||
return None
|
||||
return f"{KIWIX_PUBLIC_BASE}/content/{content_path}/{quote(article_id, safe='/:@!$&\'()*+,;=')}"
|
||||
|
||||
|
||||
_PUBLIC_URL_TEMPLATES = {
|
||||
'wikipedia': "https://en.wikipedia.org/wiki/{id}",
|
||||
'wikidata': "https://www.wikidata.org/wiki/{id}",
|
||||
'wikivoyage': "https://en.wikivoyage.org/wiki/{id}",
|
||||
'appropedia': "https://www.appropedia.org/wiki/{id}",
|
||||
}
|
||||
|
||||
|
||||
def build_public_url(source_type, article_id):
|
||||
"""Build the canonical public URL for a wiki article."""
|
||||
tmpl = _PUBLIC_URL_TEMPLATES.get(source_type)
|
||||
if not tmpl:
|
||||
return None
|
||||
return tmpl.format(id=quote(article_id, safe='/:@!$&\'()*+,;='))
|
||||
|
||||
|
||||
# ── Kiwix availability check ───────────────────────────────────────────
|
||||
|
||||
def check_kiwix_has_article(source_type, article_id):
|
||||
"""
|
||||
Check if an article exists in local Kiwix.
|
||||
|
||||
Returns (bool, url):
|
||||
- (True, kiwix_public_url) if article exists locally
|
||||
- (False, None) if not found or Kiwix unavailable
|
||||
|
||||
Only positive results are cached.
|
||||
"""
|
||||
# Check cache first
|
||||
db = _get_db()
|
||||
row = db.execute(
|
||||
"SELECT kiwix_url FROM wiki_cache WHERE source_type=? AND article_id=?",
|
||||
(source_type, article_id)
|
||||
).fetchone()
|
||||
if row:
|
||||
return (True, row[0])
|
||||
|
||||
# Build local HEAD URL
|
||||
_ensure_zim_map()
|
||||
content_path = _zim_map.get(source_type)
|
||||
if not content_path:
|
||||
return (False, None)
|
||||
|
||||
head_url = f"{KIWIX_BASE}/content/{content_path}/{quote(article_id, safe='/:@!$&\'()*+,;=')}"
|
||||
|
||||
try:
|
||||
resp = http_requests.head(head_url, timeout=HEAD_TIMEOUT, allow_redirects=True)
|
||||
if resp.status_code == 200:
|
||||
kiwix_url = build_kiwix_url(source_type, article_id)
|
||||
# Cache positive result
|
||||
now = int(time.time())
|
||||
db.execute("""
|
||||
INSERT OR REPLACE INTO wiki_cache (source_type, article_id, kiwix_url, cached_at)
|
||||
VALUES (?, ?, ?, ?)
|
||||
""", (source_type, article_id, kiwix_url, now))
|
||||
db.commit()
|
||||
return (True, kiwix_url)
|
||||
else:
|
||||
return (False, None)
|
||||
except Exception as e:
|
||||
logger.debug(f"Kiwix HEAD failed for {source_type}/{article_id}: {e}")
|
||||
return (False, None)
|
||||
|
||||
|
||||
# ── Primary entry point ────────────────────────────────────────────────
|
||||
|
||||
def rewrite_wiki_link(tag_name, value):
|
||||
"""
|
||||
Rewrite an OSM wiki tag value to a local Kiwix URL if available.
|
||||
|
||||
Returns (url, 'local'|'public') or (None, None) if unrecognized.
|
||||
"""
|
||||
classified = classify_wiki_link(tag_name, value)
|
||||
if not classified:
|
||||
return (value, 'original')
|
||||
|
||||
source_type, article_id = classified
|
||||
|
||||
# Try local Kiwix
|
||||
found, kiwix_url = check_kiwix_has_article(source_type, article_id)
|
||||
if found and kiwix_url:
|
||||
return (kiwix_url, 'local')
|
||||
|
||||
# Fall back to public URL
|
||||
public_url = build_public_url(source_type, article_id)
|
||||
if public_url:
|
||||
return (public_url, 'public')
|
||||
|
||||
return (value, 'original')
|
||||
|
||||
|
||||
# ── Discovery stubs (disabled, for future activation) ───────────────────
|
||||
|
||||
def discover_wikivoyage_article(name, category, lat, lon):
|
||||
"""
|
||||
Discover a related Wikivoyage article for a place.
|
||||
Enabled by has_wiki_discovery. Currently returns None.
|
||||
"""
|
||||
return None
|
||||
|
||||
|
||||
def discover_appropedia_article(name, category):
|
||||
"""
|
||||
Discover a related Appropedia article for a place.
|
||||
Enabled by has_wiki_discovery. Currently returns None.
|
||||
"""
|
||||
return None
|
||||
Loading…
Add table
Add a link
Reference in a new issue