# recon/lib/zim_monitor.py

"""
ZIM Monitor — detects ZIMs loaded in kiwix-serve and tracks them in recon.db.

Polls the kiwix-serve OPDS v2 catalog, compares against the zim_sources table,
and for new ZIMs reads accurate metadata via python-libzim's Counter field.

Standalone:  python3 /opt/recon/lib/zim_monitor.py
As module:   from lib.zim_monitor import scan_zims
"""
import logging
import os
import sqlite3
import sys
import urllib.request
from xml.etree import ElementTree as ET

# Put /opt/recon at the front of the import path so that "lib.utils" resolves
# when this file is executed directly as a script.
sys.path.insert(0, "/opt/recon")
from lib.utils import setup_logging

# python-libzim is optional: remember whether it imported so that
# get_libzim_metadata() can skip the metadata read when it is missing.
try:
    from libzim.reader import Archive
    HAVE_LIBZIM = True
except ImportError:
    HAVE_LIBZIM = False

# kiwix-serve OPDS v2 catalog endpoint polled by fetch_opds().
# NOTE(review): count=-1 presumably requests every entry — confirm against
# the kiwix-serve documentation.
OPDS_URL = "http://localhost:8430/catalog/v2/entries?count=-1"
# Directory joined with the derived filename to locate a ZIM file on disk.
ZIM_DIR = "/mnt/kiwix"
# SQLite database that holds the zim_sources table.
DB_PATH = "/opt/recon/data/recon.db"

# Atom XML namespace used to qualify element names when searching the catalog.
ATOM_NS = "http://www.w3.org/2005/Atom"

# Module-level logger shared by every function in this file.
logger = logging.getLogger("recon.zim_monitor")


def _text(element, tag, ns=ATOM_NS):
    """Return the stripped text of the first ``{ns}tag`` child of *element*.

    Returns None when no such child exists or when its text is None or
    empty. Text consisting only of whitespace comes back as an empty string.
    """
    qualified_name = f"{{{ns}}}{tag}"
    node = element.find(qualified_name)
    if node is None or not node.text:
        return None
    return node.text.strip()


def parse_counter(counter_str):
    """Turn a ZIM ``Counter`` metadata string into ``{mimetype: count}``.

    The input is read as ``;``-separated ``mimetype=count`` segments, split
    at the first ``=``. Segments without an ``=`` or with a non-integer
    count are ignored. Surrounding whitespace is trimmed from both parts,
    and when a mimetype repeats the last value wins.
    """
    counts = {}
    for segment in counter_str.split(";"):
        mime, separator, raw_count = segment.partition("=")
        if not separator:
            # No "=" at all: not a mimetype=count pair.
            continue
        try:
            value = int(raw_count.strip())
        except ValueError:
            # Count is not an integer; drop this segment.
            continue
        counts[mime.strip()] = value
    return counts


def fetch_opds():
    """Fetch the OPDS v2 catalog from kiwix-serve and flatten its entries.

    Returns:
        A list with one dict per Atom ``<entry>`` element, holding the keys
        ``uuid``, ``title``, ``name``, ``flavour``, ``language``,
        ``category``, ``summary``, ``article_count_opds`` (int) and
        ``zim_filename``. Text fields are None when the element is absent.
        An empty list is returned when the catalog cannot be fetched or
        cannot be parsed as XML; the failure is logged rather than raised.
    """
    try:
        with urllib.request.urlopen(OPDS_URL, timeout=10) as resp:
            data = resp.read()
    except Exception as e:
        logger.error("Failed to fetch OPDS catalog: %s", e)
        return []

    # A truncated or non-XML response is handled like a failed fetch, so the
    # caller sees "no entries" instead of an unhandled ParseError.
    try:
        root = ET.fromstring(data)
    except ET.ParseError as e:
        logger.error("Failed to parse OPDS catalog: %s", e)
        return []

    entries = []
    for entry in root.findall(f"{{{ATOM_NS}}}entry"):
        uuid_raw = _text(entry, "id")
        uuid = uuid_raw.replace("urn:uuid:", "") if uuid_raw else None

        # One malformed articleCount must not discard the whole catalog.
        raw_count = _text(entry, "articleCount")
        try:
            article_count = int(raw_count or 0)
        except ValueError:
            logger.warning(
                "Ignoring non-integer articleCount %r for entry %s",
                raw_count, uuid,
            )
            article_count = 0

        entries.append({
            "uuid": uuid,
            "title": _text(entry, "title"),
            "name": _text(entry, "name"),
            "flavour": _text(entry, "flavour"),
            "language": _text(entry, "language"),
            "category": _text(entry, "category") or None,
            "summary": _text(entry, "summary"),
            "article_count_opds": article_count,
            "zim_filename": _zim_filename_from_entry(entry),
        })
    return entries


def _zim_filename_from_entry(entry):
    """Derive ``<name>.zim`` from an entry's first ``text/html`` link, or None."""
    for link in entry.findall(f"{{{ATOM_NS}}}link"):
        if link.get("type") != "text/html":
            continue
        # href looks like /content/appropedia_en_all_maxi_2025-11
        last_segment = link.get("href", "").rsplit("/", 1)[-1]
        # Only the first text/html link is considered, even if it is empty.
        return last_segment + ".zim" if last_segment else None
    return None


def get_libzim_metadata(zim_path):
    """Read title, description, language, tags and article count from a ZIM.

    Opens *zim_path* with python-libzim's ``Archive`` and returns a dict with
    the keys ``title``, ``description``, ``language``, ``tags``,
    ``article_count`` and ``counter_raw``. ``article_count`` is the
    ``text/html`` value parsed from the ``Counter`` metadata (0 when the
    field is missing or empty), and ``counter_raw`` is the unparsed string
    or None. When python-libzim is not installed a warning is logged and an
    empty dict is returned.
    """
    if not HAVE_LIBZIM:
        logger.warning("python-libzim not available, skipping metadata read")
        return {}

    archive = Archive(zim_path)

    def read_field(zim_key):
        # A RuntimeError from the lookup is treated as "field not available"
        # (presumably raised by libzim for absent keys — confirm).
        try:
            return archive.get_metadata(zim_key).decode("utf-8", errors="replace")
        except RuntimeError:
            return None

    # Result key -> ZIM metadata key, read in this order.
    field_map = (
        ("title", "Title"),
        ("description", "Description"),
        ("language", "Language"),
        ("tags", "Tags"),
    )
    info = {result_key: read_field(zim_key) for result_key, zim_key in field_map}

    raw_counter = read_field("Counter")
    info["article_count"] = (
        parse_counter(raw_counter).get("text/html", 0) if raw_counter else 0
    )
    # An empty Counter string is stored as None, the same as a missing one.
    info["counter_raw"] = raw_counter or None

    return info


def scan_zims():
    """Compare the OPDS catalog against the zim_sources table and sync it.

    For each catalog entry whose derived filename is not yet tracked and
    whose file exists under ZIM_DIR, metadata is read via
    get_libzim_metadata() and a row with status 'detected' is inserted.
    Tracked rows whose filename is no longer in the catalog, and which are
    not already 'removed', are updated to status 'removed'.

    All changes are applied in one transaction: they are committed together
    on success and rolled back if an unexpected error occurs. The database
    connection is closed in either case. Returns None.
    """
    logger.info("Scanning kiwix-serve OPDS catalog...")
    opds_entries = fetch_opds()
    if not opds_entries:
        # fetch_opds() also returns an empty list on failure; returning here
        # keeps the removal pass from running against an empty catalog.
        logger.info("No entries in OPDS catalog (or fetch failed)")
        return

    logger.info("OPDS returned %d entries", len(opds_entries))

    con = sqlite3.connect(DB_PATH)
    try:
        con.row_factory = sqlite3.Row
        # "with con" commits on success and rolls back on an exception; it
        # does not close the connection, hence the surrounding try/finally.
        with con:
            new_count, removed_count = _sync_zim_sources(con, opds_entries)
    finally:
        con.close()

    logger.info(
        "Scan complete: %d new, %d removed, %d total in catalog",
        new_count, removed_count, len(opds_entries),
    )


def _sync_zim_sources(con, opds_entries):
    """Insert new ZIMs and flag vanished ones; return (new_count, removed_count)."""
    # Get existing zim_sources keyed by filename
    existing = {
        row["zim_filename"]: dict(row)
        for row in con.execute("SELECT id, zim_filename, status FROM zim_sources")
    }

    opds_filenames = set()
    new_count = 0

    for entry in opds_entries:
        filename = entry["zim_filename"]
        if not filename:
            logger.warning("Skipping OPDS entry with no derivable filename: %s", entry)
            continue

        if filename in opds_filenames:
            # The same filename was already handled in this scan; a second
            # INSERT would create a duplicate row.
            logger.debug("Duplicate OPDS entry for %s, ignoring", filename)
            continue
        opds_filenames.add(filename)

        if filename in existing:
            # NOTE(review): a row whose status is 'removed' stays 'removed'
            # when the ZIM reappears in the catalog — confirm whether it
            # should be re-detected.
            logger.debug("Already tracked: %s (status=%s)", filename, existing[filename]["status"])
            continue

        # New ZIM — read accurate metadata via python-libzim
        zim_path = os.path.join(ZIM_DIR, filename)
        if not os.path.isfile(zim_path):
            logger.warning("ZIM file not found on disk: %s", zim_path)
            continue

        logger.info("New ZIM detected: %s — reading metadata via libzim", filename)
        try:
            meta = get_libzim_metadata(zim_path)
        except Exception:
            # One unreadable file must not abort the scan for every other
            # ZIM. It is left untracked, so the next scan will retry it.
            logger.exception("Failed to read metadata from %s — skipping", zim_path)
            continue

        title = meta.get("title") or entry["title"]
        article_count = meta.get("article_count", 0)

        con.execute(
            """INSERT INTO zim_sources
               (zim_filename, zim_path, zim_uuid, title, description,
                language, category, article_count, status)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'detected')""",
            (
                filename,
                zim_path,
                entry["uuid"],
                title,
                meta.get("description") or entry["summary"],
                meta.get("language") or entry["language"],
                entry["category"],
                article_count,
            ),
        )
        new_count += 1
        logger.info(
            "  Inserted: %s — title=%r, articles=%s (OPDS said %s)",
            filename,
            title,
            article_count,
            entry["article_count_opds"],
        )

    # Detect removed ZIMs (in DB but not in OPDS, and not already marked removed)
    removed_count = 0
    for filename, row in existing.items():
        if filename in opds_filenames or row["status"] == "removed":
            continue
        con.execute(
            "UPDATE zim_sources SET status = 'removed' WHERE id = ?",
            (row["id"],),
        )
        removed_count += 1
        logger.info("Marked removed: %s", filename)

    return new_count, removed_count


if __name__ == "__main__":
    # Standalone run: set up logging for this module's logger name, then
    # perform a single catalog scan.
    # NOTE(review): setup_logging comes from lib.utils and is not visible
    # here; presumably it attaches handlers for the named logger — confirm.
    setup_logging("recon.zim_monitor")
    scan_zims()