"""
ZIM Monitor — detects ZIMs loaded in kiwix-serve and tracks them in recon.db.

Polls the kiwix-serve OPDS v2 catalog, compares against the zim_sources table,
and for new ZIMs reads accurate metadata via python-libzim's Counter field.

Standalone: python3 /opt/recon/lib/zim_monitor.py
As module: from lib.zim_monitor import scan_zims

Deployment notes (from the commit that introduced this module):
  * kiwix-serve listens on port 8430 (kiwix.service systemd unit).
  * kiwix-tools 3.7.0 is installed from a binary tarball under
    /opt/recon/bin/ (git-ignored); the 3.5.0 shipped by Ubuntu 24.04 apt
    lacks OPDS v2, which OPDS_URL depends on.
  * The zim_sources / zim_samples / zim_articles tables are created directly
    via sqlite3, not by migrations, so zim_sources must exist before a scan.
  * python-libzim is optional at import time; without it new ZIMs are
    recorded from OPDS metadata only, with an article count of 0.
"""
import logging
import os
import sqlite3
import sys
import urllib.request
from contextlib import closing
from xml.etree import ElementTree as ET

sys.path.insert(0, "/opt/recon")
from lib.utils import setup_logging  # noqa: E402  (needs the sys.path entry above)

try:
    from libzim.reader import Archive
    HAVE_LIBZIM = True
except ImportError:
    HAVE_LIBZIM = False

OPDS_URL = "http://localhost:8430/catalog/v2/entries?count=-1"
ZIM_DIR = "/mnt/kiwix"
DB_PATH = "/opt/recon/data/recon.db"

ATOM_NS = "http://www.w3.org/2005/Atom"

logger = logging.getLogger("recon.zim_monitor")


def _text(element, tag, ns=ATOM_NS):
    """Return the stripped text of child ``tag`` in namespace ``ns``, or None.

    None is returned when the child element is missing or has no text.
    Whitespace-only text yields an empty string.
    """
    child = element.find(f"{{{ns}}}{tag}")
    if child is None or not child.text:
        return None
    return child.text.strip()


def _to_int(value, default=0):
    """Convert OPDS text to int; fall back to ``default`` if absent or malformed."""
    if not value:
        return default
    try:
        return int(value)
    except ValueError:
        return default


def parse_counter(counter_str):
    """Parse ZIM Counter metadata into {mimetype: count}.

    The value is a ``;``-separated list of ``mimetype=count`` pairs. Pairs
    without ``=`` or with a non-integer count are ignored. ``None`` or an
    empty string yields an empty dict.
    """
    result = {}
    if not counter_str:
        return result
    for pair in counter_str.split(";"):
        mime, sep, count = pair.partition("=")
        if not sep:
            continue
        try:
            result[mime.strip()] = int(count.strip())
        except ValueError:
            # Malformed count: skip this pair rather than fail the caller.
            continue
    return result


def _zim_filename(entry):
    """Derive the ZIM filename from an OPDS entry's ``text/html`` content link.

    Returns None when the entry has no content link with a usable last path
    segment.
    """
    for link in entry.findall(f"{{{ATOM_NS}}}link"):
        if link.get("type") != "text/html":
            continue
        # href looks like /content/appropedia_en_all_maxi_2025-11
        # A trailing "/" is stripped so it does not produce an empty name.
        name = link.get("href", "").rstrip("/").rsplit("/", 1)[-1]
        if name:
            return name + ".zim"
    return None


def _parse_opds(data):
    """Parse an OPDS Atom feed document into a list of entry dicts.

    Raises:
        xml.etree.ElementTree.ParseError: if ``data`` is not well-formed XML.
    """
    root = ET.fromstring(data)
    entries = []
    for entry in root.findall(f"{{{ATOM_NS}}}entry"):
        uuid_raw = _text(entry, "id")
        entries.append({
            "uuid": uuid_raw.replace("urn:uuid:", "") if uuid_raw else None,
            "title": _text(entry, "title"),
            "name": _text(entry, "name"),
            "flavour": _text(entry, "flavour"),
            "language": _text(entry, "language"),
            # ``or None`` normalises whitespace-only text ("") to None.
            "category": _text(entry, "category") or None,
            "summary": _text(entry, "summary"),
            "article_count_opds": _to_int(_text(entry, "articleCount")),
            "zim_filename": _zim_filename(entry),
        })
    return entries


def fetch_opds():
    """Fetch OPDS v2 catalog from kiwix-serve. Returns list of dicts.

    Each dict has the keys ``uuid``, ``title``, ``name``, ``flavour``,
    ``language``, ``category``, ``summary``, ``article_count_opds`` and
    ``zim_filename`` (None when it cannot be derived from the content link).

    Best-effort: a transport failure or a malformed feed is logged and
    reported as an empty list, never raised.
    """
    try:
        with urllib.request.urlopen(OPDS_URL, timeout=10) as resp:
            data = resp.read()
    except Exception as e:  # boundary: any transport failure means "no catalog"
        logger.error("Failed to fetch OPDS catalog: %s", e)
        return []

    try:
        return _parse_opds(data)
    except ET.ParseError as e:
        logger.error("Failed to parse OPDS catalog: %s", e)
        return []


def get_libzim_metadata(zim_path):
    """Open a ZIM file and read accurate metadata via python-libzim.

    Returns a dict with ``title``, ``description``, ``language``, ``tags``
    (each a str, or None when the ZIM lacks that metadata key),
    ``article_count`` (the ``text/html`` count from the Counter metadata,
    0 when absent) and ``counter_raw`` (the raw Counter string or None).
    Returns an empty dict when python-libzim is not installed.

    Errors raised by ``Archive`` while opening the file propagate to the
    caller.
    """
    if not HAVE_LIBZIM:
        logger.warning("python-libzim not available, skipping metadata read")
        return {}

    zim = Archive(zim_path)

    def _get_meta(key):
        # A missing metadata key surfaces as an exception; the original code
        # handled RuntimeError, KeyError is accepted defensively as well.
        try:
            return zim.get_metadata(key).decode("utf-8", errors="replace")
        except (KeyError, RuntimeError):
            return None

    meta = {
        "title": _get_meta("Title"),
        "description": _get_meta("Description"),
        "language": _get_meta("Language"),
        "tags": _get_meta("Tags"),
    }

    counter_str = _get_meta("Counter")
    if counter_str:
        meta["article_count"] = parse_counter(counter_str).get("text/html", 0)
        meta["counter_raw"] = counter_str
    else:
        meta["article_count"] = 0
        meta["counter_raw"] = None

    return meta


def scan_zims():
    """Compare OPDS catalog against zim_sources table. Insert/update as needed.

    * A ZIM in the catalog but not in the table is inserted with status
      ``detected`` (only if its file exists under ZIM_DIR and its metadata
      can be read; otherwise it is skipped and retried on the next scan).
    * A ZIM in the table with status ``removed`` that is back in the catalog
      is reset to ``detected``.
    * A ZIM in the table but no longer in the catalog is marked ``removed``.

    An empty catalog (which is also what a failed fetch looks like) is a
    no-op, so an unreachable kiwix-serve never marks everything removed.
    All changes are committed together; if anything raises, they are rolled
    back and the connection is still closed.
    """
    logger.info("Scanning kiwix-serve OPDS catalog...")
    opds_entries = fetch_opds()
    if not opds_entries:
        logger.info("No entries in OPDS catalog (or fetch failed)")
        return

    logger.info("OPDS returned %d entries", len(opds_entries))

    opds_filenames = set()
    new_count = 0
    removed_count = 0

    # closing() releases the handle; the inner ``con`` context commits on
    # success and rolls back on error (it does not close by itself).
    with closing(sqlite3.connect(DB_PATH)) as con, con:
        con.row_factory = sqlite3.Row

        # Existing zim_sources rows keyed by filename
        existing = {
            row["zim_filename"]: dict(row)
            for row in con.execute(
                "SELECT id, zim_filename, status FROM zim_sources"
            )
        }

        for entry in opds_entries:
            filename = entry["zim_filename"]
            if not filename:
                logger.warning(
                    "Skipping OPDS entry with no derivable filename: %s", entry
                )
                continue

            if filename in opds_filenames:
                # Same filename twice in one catalog: avoid a second INSERT.
                logger.warning("Duplicate OPDS entry for %s, ignoring", filename)
                continue
            opds_filenames.add(filename)

            tracked = existing.get(filename)
            if tracked is not None:
                if tracked["status"] == "removed":
                    # NOTE(review): 'detected' is the only "present" status
                    # this module knows; confirm downstream stages are happy
                    # to re-process a ZIM that was removed and re-added.
                    con.execute(
                        "UPDATE zim_sources SET status = 'detected' WHERE id = ?",
                        (tracked["id"],),
                    )
                    logger.info("Restored previously removed ZIM: %s", filename)
                else:
                    logger.debug(
                        "Already tracked: %s (status=%s)",
                        filename, tracked["status"],
                    )
                continue

            # New ZIM — read accurate metadata via python-libzim
            zim_path = os.path.join(ZIM_DIR, filename)
            if not os.path.isfile(zim_path):
                logger.warning("ZIM file not found on disk: %s", zim_path)
                continue

            logger.info(
                "New ZIM detected: %s — reading metadata via libzim", filename
            )
            try:
                meta = get_libzim_metadata(zim_path)
            except (RuntimeError, OSError, ValueError):
                # One unreadable ZIM must not abort the scan for the others;
                # it stays untracked so the next scan tries again.
                logger.exception("Failed to read ZIM metadata: %s", zim_path)
                continue

            title = meta.get("title") or entry["title"]
            article_count = meta.get("article_count", 0)

            con.execute(
                """INSERT INTO zim_sources
                   (zim_filename, zim_path, zim_uuid, title, description,
                    language, category, article_count, status)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'detected')""",
                (
                    filename,
                    zim_path,
                    entry["uuid"],
                    title,
                    meta.get("description") or entry["summary"],
                    meta.get("language") or entry["language"],
                    entry["category"],
                    article_count,
                ),
            )
            new_count += 1
            logger.info(
                " Inserted: %s — title=%r, articles=%s (OPDS said %s)",
                filename,
                title,
                article_count,
                entry["article_count_opds"],
            )

        # Detect removed ZIMs (in DB but not in OPDS, and not already marked removed)
        for filename, row in existing.items():
            if filename not in opds_filenames and row["status"] != "removed":
                con.execute(
                    "UPDATE zim_sources SET status = 'removed' WHERE id = ?",
                    (row["id"],),
                )
                removed_count += 1
                logger.info("Marked removed: %s", filename)

    logger.info(
        "Scan complete: %d new, %d removed, %d total in catalog",
        new_count, removed_count, len(opds_entries),
    )


if __name__ == "__main__":
    setup_logging("recon.zim_monitor")
    scan_zims()