From 6365fe67565ffabe67f5a0d3647f1aaa263bc983 Mon Sep 17 00:00:00 2001
From: malice
Date: Sat, 23 May 2026 21:21:22 -0600
Subject: [PATCH] decouple: remove /api/wiki-rewrite (migrated to navi-places)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR-B of decouple #4-REWRITE — the LAST recon→navi decoupling step.
navi-places now owns the Kiwix link-rewrite logic in-process
(navi-backend PR-A 7103c27, deployed + verified: Twin Falls live route
returns wiki_rewrites local/public from navi's own wiki_cache.db; zero
outbound calls to recon /api/wiki-rewrite).

- DELETE lib/wiki_rewrite.py (the Kiwix rewrite logic — ported to navi-places).
- DELETE lib/wiki_rewrite_api.py (the /api/wiki-rewrite blueprint).
- DELETE lib/wiki_rewrite_api_test.py (tests the deleted endpoint).
- api.py: drop the wiki_rewrite_bp import + register_blueprint + section comment.

Verified zero recon consumers: nothing in recon imports wiki_rewrite — it
was purely an HTTP endpoint for navi-places. After this, recon services
make and receive zero navi-ecosystem runtime calls; recon is a fully
separate product.

Out-of-band (post-deploy): DROP TABLE wiki_cache from
/opt/recon/data/place_cache.db (table only — place_cache +
google_api_calls stay).

Co-authored-by: Claude Opus 4.7 (1M context) --- lib/api.py | 5 - lib/wiki_rewrite.py | 324 ----------------------------------- lib/wiki_rewrite_api.py | 34 ---- lib/wiki_rewrite_api_test.py | 73 -------- 4 files changed, 436 deletions(-) delete mode 100644 lib/wiki_rewrite.py delete mode 100644 lib/wiki_rewrite_api.py delete mode 100644 lib/wiki_rewrite_api_test.py diff --git a/lib/api.py b/lib/api.py index 63562f7..576f4ee 100644 --- a/lib/api.py +++ b/lib/api.py @@ -62,11 +62,6 @@ app.request_class = _LargeZimRequest from .netsyms_api import netsyms_bp app.register_blueprint(netsyms_bp) -# ── Wiki-rewrite Blueprint (extraction #5 prep — HTTP wrapper over rewrite_wiki_link) ── -from .wiki_rewrite_api import wiki_rewrite_bp -app.register_blueprint(wiki_rewrite_bp) - - # ── Navigation Constants ── diff --git a/lib/wiki_rewrite.py b/lib/wiki_rewrite.py deleted file mode 100644 index d884635..0000000 --- a/lib/wiki_rewrite.py +++ /dev/null @@ -1,324 +0,0 @@ -""" -Wiki link rewriter — rewrites OSM wikipedia/wikidata/wikivoyage/appropedia -links to local Kiwix URLs where the article exists in a loaded ZIM. - -Falls back silently to public URLs when article is unavailable locally. -Caches positive results only in place_cache.db. - -Kiwix catalog is parsed from the OPDS Atom feed at startup and refreshed -hourly to pick up newly loaded ZIMs without a restart. - -Operations note: - - After loading a new ZIM, either restart RECON (forces fresh catalog - fetch) or wait up to 1 hour for automatic refresh. - - To invalidate the wiki cache (e.g. 
after ZIM update): - sqlite3 /opt/recon/data/place_cache.db "DELETE FROM wiki_cache;" -""" -import os -import re -import sqlite3 -import time -import xml.etree.ElementTree as ET -from urllib.parse import unquote, quote - -import requests as http_requests - -from .utils import setup_logging - -logger = setup_logging('recon.wiki_rewrite') - -# ── Configuration ─────────────────────────────────────────────────────── - -KIWIX_BASE = "http://localhost:8430" -KIWIX_PUBLIC_BASE = "https://wiki.echo6.co" -KIWIX_CATALOG_URL = f"{KIWIX_BASE}/catalog/v2/entries" -HEAD_TIMEOUT = 1.5 # seconds -CATALOG_REFRESH_INTERVAL = 3600 # 1 hour - -# OPDS Atom namespace -_ATOM_NS = "http://www.w3.org/2005/Atom" - -# ── ZIM catalog map ───────────────────────────────────────────────────── - -_zim_map = {} # source_type → content_path e.g. 'wikipedia' → 'wikipedia_en_all_maxi_2026-02' -_zim_map_ts = 0.0 # last refresh timestamp - -# Prefix-to-source-type mapping (order matters: longest prefix first) -_ZIM_PREFIX_MAP = [ - ('wikipedia_en_all', 'wikipedia'), - ('appropedia_en_all', 'appropedia'), - ('wikivoyage_en', 'wikivoyage'), - ('wikidata_en', 'wikidata'), -] - - -def _discover_zims(): - """Parse Kiwix OPDS Atom catalog to map source types to content paths.""" - global _zim_map, _zim_map_ts - - try: - resp = http_requests.get(KIWIX_CATALOG_URL, timeout=5) - if resp.status_code != 200: - logger.warning(f"Kiwix catalog returned HTTP {resp.status_code}") - return - - root = ET.fromstring(resp.content) - new_map = {} - - for entry in root.findall(f"{{{_ATOM_NS}}}entry"): - name_el = entry.find(f"{{{_ATOM_NS}}}name") - if name_el is None: - continue - book_name = name_el.text or "" - - # - content_path = None - for link in entry.findall(f"{{{_ATOM_NS}}}link"): - if link.get("type") == "text/html": - href = link.get("href", "") - if href.startswith("/content/"): - content_path = href[len("/content/"):] - break - - if not content_path: - continue - - # Match book name against known prefixes - 
for prefix, source_type in _ZIM_PREFIX_MAP: - if book_name.startswith(prefix): - new_map[source_type] = content_path - break - - _zim_map = new_map - _zim_map_ts = time.time() - logger.info(f"ZIM catalog refreshed: {new_map}") - - except Exception as e: - logger.warning(f"Failed to discover ZIMs from Kiwix catalog: {e}") - - -def _ensure_zim_map(): - """Lazy-load and refresh ZIM map if stale.""" - if not _zim_map or (time.time() - _zim_map_ts) > CATALOG_REFRESH_INTERVAL: - _discover_zims() - - -# ── Database (wiki_cache in place_cache.db) ───────────────────────────── - -_db_conn = None - - -def _get_db(): - """Return a module-level SQLite connection to place_cache.db (lazy init).""" - global _db_conn - if _db_conn is not None: - return _db_conn - - db_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data') - os.makedirs(db_dir, exist_ok=True) - db_path = os.path.join(db_dir, 'place_cache.db') - - _db_conn = sqlite3.connect(db_path, check_same_thread=False) - _db_conn.execute("PRAGMA journal_mode=WAL") - _db_conn.execute("PRAGMA synchronous=NORMAL") - _db_conn.execute(""" - CREATE TABLE IF NOT EXISTS wiki_cache ( - source_type TEXT NOT NULL, - article_id TEXT NOT NULL, - kiwix_url TEXT NOT NULL, - cached_at INTEGER NOT NULL, - PRIMARY KEY (source_type, article_id) - ) - """) - _db_conn.commit() - logger.info(f"Wiki cache table ready in {db_path}") - return _db_conn - - -# ── URL classification ────────────────────────────────────────────────── - -# Patterns for OSM wikipedia/wikidata tag values -_WIKI_TAG_RE = re.compile(r'^(?:en:)?(.+)$') # "en:Title" or just "Title" -_WIKI_URL_RE = re.compile(r'https?://en\.wikipedia\.org/wiki/(.+)') -_WIKIDATA_TAG_RE = re.compile(r'^(Q\d+)$') -_WIKIDATA_URL_RE = re.compile(r'https?://(?:www\.)?wikidata\.org/wiki/(Q\d+)') -_WIKIVOYAGE_URL_RE = re.compile(r'https?://en\.wikivoyage\.org/wiki/(.+)') -_APPROPEDIA_URL_RE = re.compile(r'https?://(?:www\.)?appropedia\.org/(?:wiki/)?(.+)') - - -def 
_normalize_article_id(article_id): - """Normalize article ID to MediaWiki/Kiwix convention: spaces → underscores.""" - return article_id.replace(' ', '_') - - -def classify_wiki_link(tag_name, value): - """ - Classify an OSM extratag value into (source_type, article_id) or None. - - tag_name: the extratags key ('wikipedia', 'wikidata', etc.) - value: the raw tag value from OSM - - Article IDs are normalized to MediaWiki convention (spaces → underscores). - """ - if not value or not isinstance(value, str): - return None - - value = value.strip() - - if tag_name == 'wikidata': - m = _WIKIDATA_TAG_RE.match(value) - if m: - return ('wikidata', m.group(1)) - m = _WIKIDATA_URL_RE.match(value) - if m: - return ('wikidata', m.group(1)) - return None - - if tag_name == 'wikipedia': - # URL form: https://en.wikipedia.org/wiki/Title - m = _WIKI_URL_RE.match(value) - if m: - return ('wikipedia', _normalize_article_id(unquote(m.group(1)))) - # Tag form: "en:Title" or "Title" - m = _WIKI_TAG_RE.match(value) - if m: - return ('wikipedia', _normalize_article_id(m.group(1))) - return None - - if tag_name == 'wikivoyage': - m = _WIKIVOYAGE_URL_RE.match(value) - if m: - return ('wikivoyage', _normalize_article_id(unquote(m.group(1)))) - # Plain tag: "en:Title" or "Title" - m = _WIKI_TAG_RE.match(value) - if m: - return ('wikivoyage', _normalize_article_id(m.group(1))) - return None - - if tag_name == 'appropedia': - m = _APPROPEDIA_URL_RE.match(value) - if m: - return ('appropedia', _normalize_article_id(unquote(m.group(1)))) - return ('appropedia', _normalize_article_id(value)) - - return None - - -# ── URL builders ──────────────────────────────────────────────────────── - -def build_kiwix_url(source_type, article_id): - """Build a public Kiwix URL. 
Returns None if source_type not in ZIM map.""" - _ensure_zim_map() - content_path = _zim_map.get(source_type) - if not content_path: - return None - return f"{KIWIX_PUBLIC_BASE}/content/{content_path}/{quote(article_id, safe='/:@!$&\'()*+,;=')}" - - -_PUBLIC_URL_TEMPLATES = { - 'wikipedia': "https://en.wikipedia.org/wiki/{id}", - 'wikidata': "https://www.wikidata.org/wiki/{id}", - 'wikivoyage': "https://en.wikivoyage.org/wiki/{id}", - 'appropedia': "https://www.appropedia.org/wiki/{id}", -} - - -def build_public_url(source_type, article_id): - """Build the canonical public URL for a wiki article.""" - tmpl = _PUBLIC_URL_TEMPLATES.get(source_type) - if not tmpl: - return None - return tmpl.format(id=quote(article_id, safe='/:@!$&\'()*+,;=')) - - -# ── Kiwix availability check ─────────────────────────────────────────── - -def check_kiwix_has_article(source_type, article_id): - """ - Check if an article exists in local Kiwix. - - Returns (bool, url): - - (True, kiwix_public_url) if article exists locally - - (False, None) if not found or Kiwix unavailable - - Only positive results are cached. - """ - # Check cache first - db = _get_db() - row = db.execute( - "SELECT kiwix_url FROM wiki_cache WHERE source_type=? AND article_id=?", - (source_type, article_id) - ).fetchone() - if row: - return (True, row[0]) - - # Build local HEAD URL - _ensure_zim_map() - content_path = _zim_map.get(source_type) - if not content_path: - return (False, None) - - head_url = f"{KIWIX_BASE}/content/{content_path}/{quote(article_id, safe='/:@!$&\'()*+,;=')}" - - try: - resp = http_requests.head(head_url, timeout=HEAD_TIMEOUT, allow_redirects=True) - if resp.status_code == 200: - kiwix_url = build_kiwix_url(source_type, article_id) - # Cache positive result - now = int(time.time()) - db.execute(""" - INSERT OR REPLACE INTO wiki_cache (source_type, article_id, kiwix_url, cached_at) - VALUES (?, ?, ?, ?) 
- """, (source_type, article_id, kiwix_url, now)) - db.commit() - return (True, kiwix_url) - else: - return (False, None) - except Exception as e: - logger.debug(f"Kiwix HEAD failed for {source_type}/{article_id}: {e}") - return (False, None) - - -# ── Primary entry point ──────────────────────────────────────────────── - -def rewrite_wiki_link(tag_name, value): - """ - Rewrite an OSM wiki tag value to a local Kiwix URL if available. - - Returns (url, 'local'|'public') or (None, None) if unrecognized. - """ - classified = classify_wiki_link(tag_name, value) - if not classified: - return (value, 'original') - - source_type, article_id = classified - - # Try local Kiwix - found, kiwix_url = check_kiwix_has_article(source_type, article_id) - if found and kiwix_url: - return (kiwix_url, 'local') - - # Fall back to public URL - public_url = build_public_url(source_type, article_id) - if public_url: - return (public_url, 'public') - - return (value, 'original') - - -# ── Discovery stubs (disabled, for future activation) ─────────────────── - -def discover_wikivoyage_article(name, category, lat, lon): - """ - Discover a related Wikivoyage article for a place. - Enabled by has_wiki_discovery. Currently returns None. - """ - return None - - -def discover_appropedia_article(name, category): - """ - Discover a related Appropedia article for a place. - Enabled by has_wiki_discovery. Currently returns None. - """ - return None diff --git a/lib/wiki_rewrite_api.py b/lib/wiki_rewrite_api.py deleted file mode 100644 index ae1d52e..0000000 --- a/lib/wiki_rewrite_api.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Wiki-rewrite API — read-only HTTP wrapper over wiki_rewrite.rewrite_wiki_link. - -Extraction #5 prep: lets the (future) navi-places service rewrite OSM wiki tags -to local Kiwix URLs over HTTP instead of importing recon's wiki_rewrite module -(which talks to Kiwix and the wiki_cache table in /opt/recon/data/place_cache.db). 
-Additive only — does not change place_detail's in-process `_enrich_wiki_links`. - - GET /api/wiki-rewrite?tag=&value= - -Public (no auth), matching /api/place/* and /api/wiki-enrich. 400 on missing -value or unknown tag. No 404 — an unclassifiable value returns the original -value with status "original" (mirrors rewrite_wiki_link). -""" -from flask import Blueprint, request, jsonify - -from .wiki_rewrite import rewrite_wiki_link - -wiki_rewrite_bp = Blueprint('wiki_rewrite', __name__) - -_KNOWN_TAGS = {'wikipedia', 'wikidata', 'wikivoyage', 'appropedia'} - - -@wiki_rewrite_bp.route('/api/wiki-rewrite') -def api_wiki_rewrite(): - tag = (request.args.get('tag') or '').strip().lower() - value = (request.args.get('value') or '').strip() - - if not value: - return jsonify({'error': 'value is required'}), 400 - if tag not in _KNOWN_TAGS: - return jsonify({'error': f"tag must be one of {sorted(_KNOWN_TAGS)}"}), 400 - - url, status = rewrite_wiki_link(tag, value) - return jsonify({'url': url, 'status': status}) diff --git a/lib/wiki_rewrite_api_test.py b/lib/wiki_rewrite_api_test.py deleted file mode 100644 index 2bc50f4..0000000 --- a/lib/wiki_rewrite_api_test.py +++ /dev/null @@ -1,73 +0,0 @@ -"""Tests for the /api/wiki-rewrite endpoint (extraction #5 prep). - -Plain-assert style (recon's venv has no pytest). Builds a minimal Flask app -with only wiki_rewrite_bp registered. Mocks `wiki_rewrite.check_kiwix_has_article` -to control the local-Kiwix-hit vs. fallback paths without touching Kiwix or the -wiki_cache DB. classify_wiki_link (pure regex) runs for real. 
Run with pytest, -or directly: python -m lib.wiki_rewrite_api_test -""" -from flask import Flask - -from lib import wiki_rewrite -from lib.wiki_rewrite_api import wiki_rewrite_bp - - -def _client(kiwix_hit): - """kiwix_hit: (found_bool, url) returned by a stubbed check_kiwix_has_article.""" - wiki_rewrite.check_kiwix_has_article = lambda source_type, article_id: kiwix_hit - app = Flask(__name__) - app.register_blueprint(wiki_rewrite_bp) - return app.test_client() - - -def test_local_kiwix_hit(): - url = "https://wiki.echo6.co/content/wikipedia/Filer,_Idaho" - c = _client((True, url)) - resp = c.get("/api/wiki-rewrite?tag=wikipedia&value=Filer, Idaho") - assert resp.status_code == 200, resp.status_code - d = resp.get_json() - assert d["status"] == "local" - assert d["url"] == url - - -def test_public_fallback_when_not_in_kiwix(): - c = _client((False, None)) # not in Kiwix -> canonical public URL - resp = c.get("/api/wiki-rewrite?tag=wikipedia&value=Filer") - assert resp.status_code == 200, resp.status_code - d = resp.get_json() - assert d["status"] == "public" - assert d["url"] == "https://en.wikipedia.org/wiki/Filer" - - -def test_unclassifiable_returns_original(): - # 'wikidata' requires a Q-id; a non-matching value -> classify None -> original. 
- c = _client((False, None)) - resp = c.get("/api/wiki-rewrite?tag=wikidata&value=not-a-qid") - assert resp.status_code == 200, resp.status_code - d = resp.get_json() - assert d["status"] == "original" - assert d["url"] == "not-a-qid" - - -def test_missing_value_400(): - c = _client((False, None)) - assert c.get("/api/wiki-rewrite?tag=wikipedia").status_code == 400 - - -def test_unknown_tag_400(): - c = _client((False, None)) - assert c.get("/api/wiki-rewrite?tag=facebook&value=x").status_code == 400 - - -if __name__ == "__main__": - failures = 0 - for _name, _fn in sorted(globals().items()): - if _name.startswith("test_") and callable(_fn): - try: - _fn() - print(f"PASS {_name}") - except Exception as exc: # noqa: BLE001 - failures += 1 - print(f"FAIL {_name}: {exc!r}") - print("OK" if failures == 0 else f"{failures} FAILED") - raise SystemExit(1 if failures else 0)