# recon/lib/wiki_rewrite.py

"""
Wiki link rewriter — rewrites OSM wikipedia/wikidata/wikivoyage/appropedia
links to local Kiwix URLs where the article exists in a loaded ZIM.
Falls back silently to public URLs when article is unavailable locally.
Caches positive results only in place_cache.db.
Kiwix catalog is parsed from the OPDS Atom feed at startup and refreshed
hourly to pick up newly loaded ZIMs without a restart.
Operations note:
- After loading a new ZIM, either restart RECON (forces fresh catalog
fetch) or wait up to 1 hour for automatic refresh.
- To invalidate the wiki cache (e.g. after ZIM update):
sqlite3 /opt/recon/data/place_cache.db "DELETE FROM wiki_cache;"
"""
import os
import re
import sqlite3
import time
import xml.etree.ElementTree as ET
from urllib.parse import unquote, quote
import requests as http_requests
from .utils import setup_logging
logger = setup_logging('recon.wiki_rewrite')
# ── Configuration ───────────────────────────────────────────────────────
# Base URL for server-side requests to Kiwix (catalog fetch and per-article
# HEAD probes).
KIWIX_BASE = "http://localhost:8430"
# Base URL placed in the rewritten links that are returned to callers.
KIWIX_PUBLIC_BASE = "https://wiki.echo6.co"
# Catalog endpoint fetched by _discover_zims(); the response is parsed as an
# Atom feed.
KIWIX_CATALOG_URL = f"{KIWIX_BASE}/catalog/v2/entries"
HEAD_TIMEOUT = 1.5 # seconds (timeout of the per-article HEAD probe)
CATALOG_REFRESH_INTERVAL = 3600 # 1 hour (maximum age of _zim_map before a re-fetch)
# OPDS Atom namespace
_ATOM_NS = "http://www.w3.org/2005/Atom"
# ── ZIM catalog map ─────────────────────────────────────────────────────
# Module-level state, replaced wholesale by _discover_zims() on each
# successful catalog fetch.
_zim_map = {} # source_type → content_path e.g. 'wikipedia' → 'wikipedia_en_all_maxi_2026-02'
_zim_map_ts = 0.0 # last refresh timestamp (time.time() of the last successful fetch)
# Prefix-to-source-type mapping. Book names from the catalog are tested with
# str.startswith() in list order and the first match wins. The prefixes below
# do not overlap, so their order is currently irrelevant; if an overlapping
# prefix is ever added, list the longer one first.
_ZIM_PREFIX_MAP = [
('wikipedia_en_all', 'wikipedia'),
('appropedia_en_all', 'appropedia'),
('wikivoyage_en', 'wikivoyage'),
('wikidata_en', 'wikidata'),
]
def _parse_zim_catalog(xml_bytes):
    """Extract a ``source_type -> content_path`` map from an OPDS Atom feed.

    Each ``<entry>`` contributes at most one mapping: its ``<name>`` is
    matched against ``_ZIM_PREFIX_MAP`` and its first ``text/html`` link whose
    href starts with ``/content/`` supplies the content path. If several
    entries map to the same source type, the last one in the feed wins.

    Raises:
        xml.etree.ElementTree.ParseError: if *xml_bytes* is not well-formed.
    """
    entry_tag = f"{{{_ATOM_NS}}}entry"
    name_tag = f"{{{_ATOM_NS}}}name"
    link_tag = f"{{{_ATOM_NS}}}link"
    content_prefix = "/content/"

    found = {}
    root = ET.fromstring(xml_bytes)
    for entry in root.findall(entry_tag):
        name_el = entry.find(name_tag)
        if name_el is None:
            continue
        book_name = name_el.text or ""

        # Expected shape: <link type="text/html" href="/content/<book>"/>
        content_path = None
        for link in entry.findall(link_tag):
            if link.get("type") != "text/html":
                continue
            href = link.get("href", "")
            if href.startswith(content_prefix):
                # Drop any trailing slash so URL builders never emit "//".
                content_path = href[len(content_prefix):].rstrip("/")
                break
        if not content_path:
            continue

        # Match book name against known prefixes (first match wins).
        for prefix, source_type in _ZIM_PREFIX_MAP:
            if book_name.startswith(prefix):
                found[source_type] = content_path
                break
    return found


def _discover_zims():
    """Fetch the Kiwix OPDS Atom catalog and rebuild the ZIM map.

    On success ``_zim_map`` is replaced and ``_zim_map_ts`` is set to the
    current time. On any failure (network error, non-200 status, malformed
    XML) a warning is logged and the previous map and timestamp are left
    untouched, so callers keep working with the last known catalog.
    """
    global _zim_map, _zim_map_ts
    try:
        # kiwix-serve paginates this feed (10 entries per page by default);
        # count=-1 requests every entry so libraries with many ZIMs are not
        # silently truncated.
        resp = http_requests.get(
            KIWIX_CATALOG_URL, params={"count": -1}, timeout=5
        )
        if resp.status_code != 200:
            logger.warning(f"Kiwix catalog returned HTTP {resp.status_code}")
            return
        new_map = _parse_zim_catalog(resp.content)
    except Exception as e:
        # Deliberately broad: catalog discovery is best-effort and must never
        # break the request that triggered it.
        logger.warning(f"Failed to discover ZIMs from Kiwix catalog: {e}")
        return

    _zim_map = new_map
    _zim_map_ts = time.time()
    logger.info(f"ZIM catalog refreshed: {new_map}")
def _ensure_zim_map():
    """Make sure the ZIM map is populated and not older than the refresh interval.

    Triggers ``_discover_zims()`` when the map is empty or when more than
    ``CATALOG_REFRESH_INTERVAL`` seconds have passed since the last successful
    refresh. Note that an empty map causes a catalog fetch on every call.
    """
    map_is_fresh = bool(_zim_map) and (
        (time.time() - _zim_map_ts) <= CATALOG_REFRESH_INTERVAL
    )
    if map_is_fresh:
        return
    _discover_zims()
# ── Database (wiki_cache in place_cache.db) ─────────────────────────────
_db_conn = None


def _get_db():
    """Return the module-level SQLite connection to place_cache.db (lazy init).

    The database lives in the ``data`` directory two levels above this file;
    the directory is created if missing. On first use the connection is
    switched to WAL journaling with ``synchronous=NORMAL`` and the
    ``wiki_cache`` table is created if it does not exist.

    The connection is opened with ``check_same_thread=False`` and is shared
    by every caller in the process.

    Raises:
        OSError: if the data directory cannot be created.
        sqlite3.Error: if the database cannot be opened or initialised. In
            that case nothing is cached, so the next call retries from
            scratch instead of handing out a half-initialised connection.
    """
    global _db_conn
    if _db_conn is not None:
        return _db_conn

    package_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    db_dir = os.path.join(package_root, 'data')
    os.makedirs(db_dir, exist_ok=True)
    db_path = os.path.join(db_dir, 'place_cache.db')

    # Initialise on a local handle and publish it only once setup succeeded.
    conn = sqlite3.connect(db_path, check_same_thread=False)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        conn.execute("""
            CREATE TABLE IF NOT EXISTS wiki_cache (
                source_type TEXT NOT NULL,
                article_id TEXT NOT NULL,
                kiwix_url TEXT NOT NULL,
                cached_at INTEGER NOT NULL,
                PRIMARY KEY (source_type, article_id)
            )
        """)
        conn.commit()
    except Exception:
        conn.close()
        raise

    _db_conn = conn
    logger.info(f"Wiki cache table ready in {db_path}")
    return _db_conn
# ── URL classification ──────────────────────────────────────────────────
# Patterns for OSM wikipedia/wikidata tag values
_WIKI_TAG_RE = re.compile(r'^(?:en:)?(.+)$') # "en:Title" or just "Title"
_WIKI_URL_RE = re.compile(r'https?://en\.wikipedia\.org/wiki/(.+)')
_WIKIDATA_TAG_RE = re.compile(r'^(Q\d+)$')
_WIKIDATA_URL_RE = re.compile(r'https?://(?:www\.)?wikidata\.org/wiki/(Q\d+)')
_WIKIVOYAGE_URL_RE = re.compile(r'https?://en\.wikivoyage\.org/wiki/(.+)')
_APPROPEDIA_URL_RE = re.compile(r'https?://(?:www\.)?appropedia\.org/(?:wiki/)?(.+)')
def _normalize_article_id(article_id):
"""Normalize article ID to MediaWiki/Kiwix convention: spaces → underscores."""
return article_id.replace(' ', '_')
def classify_wiki_link(tag_name, value):
"""
Classify an OSM extratag value into (source_type, article_id) or None.
tag_name: the extratags key ('wikipedia', 'wikidata', etc.)
value: the raw tag value from OSM
Article IDs are normalized to MediaWiki convention (spaces underscores).
"""
if not value or not isinstance(value, str):
return None
value = value.strip()
if tag_name == 'wikidata':
m = _WIKIDATA_TAG_RE.match(value)
if m:
return ('wikidata', m.group(1))
m = _WIKIDATA_URL_RE.match(value)
if m:
return ('wikidata', m.group(1))
return None
if tag_name == 'wikipedia':
# URL form: https://en.wikipedia.org/wiki/Title
m = _WIKI_URL_RE.match(value)
if m:
return ('wikipedia', _normalize_article_id(unquote(m.group(1))))
# Tag form: "en:Title" or "Title"
m = _WIKI_TAG_RE.match(value)
if m:
return ('wikipedia', _normalize_article_id(m.group(1)))
return None
if tag_name == 'wikivoyage':
m = _WIKIVOYAGE_URL_RE.match(value)
if m:
return ('wikivoyage', _normalize_article_id(unquote(m.group(1))))
# Plain tag: "en:Title" or "Title"
m = _WIKI_TAG_RE.match(value)
if m:
return ('wikivoyage', _normalize_article_id(m.group(1)))
return None
if tag_name == 'appropedia':
m = _APPROPEDIA_URL_RE.match(value)
if m:
return ('appropedia', _normalize_article_id(unquote(m.group(1))))
return ('appropedia', _normalize_article_id(value))
return None
# ── URL builders ────────────────────────────────────────────────────────
def build_kiwix_url(source_type, article_id):
    """Build the public Kiwix URL for an article.

    Refreshes the ZIM map if needed, then joins ``KIWIX_PUBLIC_BASE``, the
    content path of the ZIM serving *source_type*, and the percent-encoded
    *article_id*.

    Returns:
        The URL string, or ``None`` if *source_type* is not in the ZIM map.
    """
    _ensure_zim_map()
    content_path = _zim_map.get(source_type)
    if not content_path:
        return None
    # Quote outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    encoded_id = quote(article_id, safe="/:@!$&'()*+,;=")
    return f"{KIWIX_PUBLIC_BASE}/content/{content_path}/{encoded_id}"
_PUBLIC_URL_TEMPLATES = {
'wikipedia': "https://en.wikipedia.org/wiki/{id}",
'wikidata': "https://www.wikidata.org/wiki/{id}",
'wikivoyage': "https://en.wikivoyage.org/wiki/{id}",
'appropedia': "https://www.appropedia.org/wiki/{id}",
}
def build_public_url(source_type, article_id):
"""Build the canonical public URL for a wiki article."""
tmpl = _PUBLIC_URL_TEMPLATES.get(source_type)
if not tmpl:
return None
return tmpl.format(id=quote(article_id, safe='/:@!$&\'()*+,;='))
# ── Kiwix availability check ───────────────────────────────────────────
def check_kiwix_has_article(source_type, article_id):
    """Check whether an article exists in the local Kiwix instance.

    The ``wiki_cache`` table is consulted first; on a miss the article is
    probed with an HTTP HEAD request against the local Kiwix server. Only
    positive results are cached.

    Returns:
        ``(True, kiwix_public_url)`` if the article exists locally,
        ``(False, None)`` if it is not found, its source type has no loaded
        ZIM, or Kiwix is unavailable.

    Raises:
        Errors from opening or reading the cache database propagate to the
        caller; failures of the HEAD probe do not.
    """
    # Check cache first
    db = _get_db()
    row = db.execute(
        "SELECT kiwix_url FROM wiki_cache WHERE source_type=? AND article_id=?",
        (source_type, article_id)
    ).fetchone()
    if row:
        return (True, row[0])

    # Build local HEAD URL
    _ensure_zim_map()
    content_path = _zim_map.get(source_type)
    if not content_path:
        return (False, None)
    # Quote outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    encoded_id = quote(article_id, safe="/:@!$&'()*+,;=")
    head_url = f"{KIWIX_BASE}/content/{content_path}/{encoded_id}"

    try:
        resp = http_requests.head(head_url, timeout=HEAD_TIMEOUT, allow_redirects=True)
        if resp.status_code != 200:
            return (False, None)
        kiwix_url = build_kiwix_url(source_type, article_id)
    except Exception as e:
        # Best-effort probe: any failure means "not available locally".
        logger.debug(f"Kiwix HEAD failed for {source_type}/{article_id}: {e}")
        return (False, None)

    if not kiwix_url:
        # The ZIM map changed between the probe and the URL build.
        return (False, None)

    # Cache positive result. A failed write must not turn a confirmed hit
    # into a miss, so it is only logged.
    try:
        db.execute("""
            INSERT OR REPLACE INTO wiki_cache (source_type, article_id, kiwix_url, cached_at)
            VALUES (?, ?, ?, ?)
        """, (source_type, article_id, kiwix_url, int(time.time())))
        db.commit()
    except sqlite3.Error as e:
        logger.debug(f"Wiki cache write failed for {source_type}/{article_id}: {e}")
    return (True, kiwix_url)
# ── Primary entry point ────────────────────────────────────────────────
def rewrite_wiki_link(tag_name, value):
    """Rewrite an OSM wiki tag value to the best available URL.

    Resolution order:
      1. the local Kiwix URL when the article is available there → ``'local'``
      2. the canonical public URL for the source type → ``'public'``
      3. the untouched input value → ``'original'`` (unrecognized tag/value,
         or no public URL template for the source type)

    Returns:
        A ``(url, origin)`` tuple where *origin* is ``'local'``, ``'public'``
        or ``'original'``.
    """
    parsed = classify_wiki_link(tag_name, value)
    if parsed:
        source_type, article_id = parsed

        # Prefer the locally served copy.
        is_local, local_url = check_kiwix_has_article(source_type, article_id)
        if is_local and local_url:
            return (local_url, 'local')

        # Otherwise point at the upstream site.
        upstream_url = build_public_url(source_type, article_id)
        if upstream_url:
            return (upstream_url, 'public')

    return (value, 'original')
# ── Discovery stubs (disabled, for future activation) ───────────────────
def discover_wikivoyage_article(name, category, lat, lon):
    """Placeholder for Wikivoyage article discovery.

    Intended to find a Wikivoyage article related to a place; the original
    note says it is to be enabled by ``has_wiki_discovery`` (not defined in
    this module). All arguments are currently ignored.

    Returns:
        Always ``None``.
    """
    return None
def discover_appropedia_article(name, category):
    """Placeholder for Appropedia article discovery.

    Intended to find an Appropedia article related to a place; the original
    note says it is to be enabled by ``has_wiki_discovery`` (not defined in
    this module). All arguments are currently ignored.

    Returns:
        Always ``None``.
    """
    return None