recon/lib/zim_monitor.py
Matt 7c1af0f063 Phase 1: Kiwix foundation — ZIM monitor and kiwix-serve setup
- Add lib/zim_monitor.py: polls kiwix-serve OPDS v2 catalog, detects
  new ZIMs, reads accurate article count from python-libzim Counter
  metadata (not inflated OPDS count), inserts into zim_sources table.
  Idempotent on re-run, marks removed ZIMs.
- DB schema: zim_sources, zim_samples, zim_articles tables (created
  via sqlite3, not in migrations — matches existing RECON pattern)
- kiwix-tools 3.7.0 installed from binary tarball at /opt/recon/bin/
  (Ubuntu 24.04 apt ships 3.5.0 which lacks OPDS v2)
- kiwix.service systemd unit on port 8430
- python-libzim 3.9.0 installed
- Test ZIM: Appropedia EN maxi (496 MB, 19,445 articles)
- Add bin/ to .gitignore (binary tarball, not source)
2026-04-16 23:39:34 +00:00

217 lines
6.8 KiB
Python

"""
ZIM Monitor — detects ZIMs loaded in kiwix-serve and tracks them in recon.db.
Polls the kiwix-serve OPDS v2 catalog, compares against the zim_sources table,
and for new ZIMs reads accurate metadata via python-libzim's Counter field.
Standalone: python3 /opt/recon/lib/zim_monitor.py
As module: from lib.zim_monitor import scan_zims
"""
import logging
import os
import sqlite3
import sys
import urllib.request
from xml.etree import ElementTree as ET
sys.path.insert(0, "/opt/recon")
from lib.utils import setup_logging
# python-libzim is optional: when the import fails, HAVE_LIBZIM is False and
# get_libzim_metadata() returns an empty dict instead of opening the ZIM file.
try:
    from libzim.reader import Archive
    HAVE_LIBZIM = True
except ImportError:
    HAVE_LIBZIM = False
# kiwix-serve OPDS v2 entries endpoint fetched by fetch_opds().
# NOTE(review): count=-1 presumably asks for every entry in one response —
# confirm against the kiwix-serve documentation.
OPDS_URL = "http://localhost:8430/catalog/v2/entries?count=-1"
# Directory that scan_zims() joins with the derived filename to find a ZIM on disk.
ZIM_DIR = "/mnt/kiwix"
# SQLite database that holds the zim_sources table.
DB_PATH = "/opt/recon/data/recon.db"
# XML namespace used for every element lookup in the OPDS feed.
ATOM_NS = "http://www.w3.org/2005/Atom"
logger = logging.getLogger("recon.zim_monitor")
def _text(element, tag, ns=ATOM_NS):
    """Return the stripped text of the first ``{ns}tag`` child of *element*.

    Returns None when the child is missing or has no text. Text consisting
    only of whitespace yields an empty string.
    """
    node = element.find("{%s}%s" % (ns, tag))
    if node is None or not node.text:
        return None
    return node.text.strip()
def parse_counter(counter_str):
    """Turn a ZIM ``Counter`` string into a ``{mimetype: count}`` dict.

    The input is a ``;``-separated list of ``mimetype=count`` pairs, split at
    the first ``=`` of each segment. Segments without ``=`` and segments whose
    count is not an integer are ignored. When a mimetype repeats, the last
    parsable count wins.
    """
    counts = {}
    for segment in counter_str.split(";"):
        mime, sep, raw_count = segment.partition("=")
        if not sep:
            # No "=" in this segment: nothing to record.
            continue
        try:
            value = int(raw_count.strip())
        except ValueError:
            continue
        counts[mime.strip()] = value
    return counts
def _zim_filename_from_entry(entry):
    """Derive ``<book-name>.zim`` from an entry's text/html link, or None."""
    for link in entry.findall(f"{{{ATOM_NS}}}link"):
        if link.get("type") != "text/html":
            continue
        # href looks like /content/appropedia_en_all_maxi_2025-11; a trailing
        # slash is dropped so the last path segment is still the book name.
        href = link.get("href", "").rstrip("/")
        name = href.rsplit("/", 1)[-1]
        if name:
            return name + ".zim"
    return None


def fetch_opds():
    """Fetch and parse the kiwix-serve OPDS v2 catalog.

    Returns:
        A list with one dict per Atom ``<entry>``, with keys ``uuid``,
        ``title``, ``name``, ``flavour``, ``language``, ``category``,
        ``summary``, ``article_count_opds`` and ``zim_filename``. Text fields
        are None when the element is absent; ``article_count_opds`` is 0 when
        the element is absent or not an integer. An empty list is returned
        when the catalog cannot be fetched or is not well-formed XML; the
        error is logged rather than raised.
    """
    try:
        with urllib.request.urlopen(OPDS_URL, timeout=10) as resp:
            data = resp.read()
    except Exception as exc:
        # Deliberately broad: this is the network boundary and any failure
        # here means "no catalog this run", which callers treat as a no-op.
        logger.error("Failed to fetch OPDS catalog: %s", exc)
        return []

    try:
        root = ET.fromstring(data)
    except ET.ParseError as exc:
        # Treat a malformed response like a failed fetch instead of crashing.
        logger.error("Failed to parse OPDS catalog: %s", exc)
        return []

    entries = []
    for entry in root.findall(f"{{{ATOM_NS}}}entry"):
        uuid_raw = _text(entry, "id")
        uuid = uuid_raw.replace("urn:uuid:", "") if uuid_raw else None

        count_raw = _text(entry, "articleCount")
        try:
            article_count = int(count_raw) if count_raw else 0
        except ValueError:
            # One bad count must not discard the rest of the catalog.
            logger.warning(
                "Ignoring non-integer articleCount %r for entry %s",
                count_raw, uuid,
            )
            article_count = 0

        entries.append({
            "uuid": uuid,
            "title": _text(entry, "title"),
            "name": _text(entry, "name"),
            "flavour": _text(entry, "flavour"),
            "language": _text(entry, "language"),
            "category": _text(entry, "category"),
            "summary": _text(entry, "summary"),
            "article_count_opds": article_count,
            "zim_filename": _zim_filename_from_entry(entry),
        })
    return entries
def get_libzim_metadata(zim_path):
    """Read descriptive metadata and the article count from a ZIM file.

    Returns an empty dict (after logging a warning) when python-libzim is not
    importable. Otherwise returns a dict with keys ``title``, ``description``,
    ``language``, ``tags``, ``article_count`` and ``counter_raw``. A metadata
    entry whose lookup raises RuntimeError is reported as None.
    ``article_count`` is the ``text/html`` figure from the ``Counter``
    metadata, or 0 when that entry is missing or empty.
    """
    if not HAVE_LIBZIM:
        logger.warning("python-libzim not available, skipping metadata read")
        return {}

    archive = Archive(zim_path)

    def read_field(name):
        # Values are decoded as UTF-8; undecodable bytes are replaced.
        try:
            raw = archive.get_metadata(name)
            return raw.decode("utf-8", errors="replace")
        except RuntimeError:
            return None

    field_sources = (
        ("title", "Title"),
        ("description", "Description"),
        ("language", "Language"),
        ("tags", "Tags"),
    )
    meta = {field: read_field(zim_key) for field, zim_key in field_sources}

    counter_raw = read_field("Counter")
    if not counter_raw:
        meta["article_count"] = 0
        meta["counter_raw"] = None
        return meta

    meta["article_count"] = parse_counter(counter_raw).get("text/html", 0)
    meta["counter_raw"] = counter_raw
    return meta
def scan_zims():
    """Reconcile the kiwix-serve OPDS catalog with the ``zim_sources`` table.

    For each catalog entry whose derived filename is not yet tracked and whose
    file exists under ``ZIM_DIR``, reads metadata via ``get_libzim_metadata``
    and inserts a row with status ``'detected'``. Tracked rows whose filename
    is absent from the catalog are set to status ``'removed'``. Nothing is
    changed when the catalog is empty or could not be fetched.

    All changes are committed once, at the end of the scan. The connection is
    always closed; if an error escapes, nothing from this run is committed.

    Returns:
        None.
    """
    logger.info("Scanning kiwix-serve OPDS catalog...")
    opds_entries = fetch_opds()
    if not opds_entries:
        logger.info("No entries in OPDS catalog (or fetch failed)")
        return
    logger.info("OPDS returned %d entries", len(opds_entries))

    con = sqlite3.connect(DB_PATH)
    try:
        con.row_factory = sqlite3.Row

        # Existing zim_sources rows keyed by filename.
        existing = {
            row["zim_filename"]: dict(row)
            for row in con.execute(
                "SELECT id, zim_filename, status FROM zim_sources"
            )
        }

        opds_filenames = set()
        new_count = 0
        for entry in opds_entries:
            filename = entry["zim_filename"]
            if not filename:
                logger.warning(
                    "Skipping OPDS entry with no derivable filename: %s", entry
                )
                continue
            if filename in opds_filenames:
                # Two catalog entries mapping to one file would otherwise be
                # inserted twice, since `existing` reflects only prior runs.
                logger.warning("Skipping duplicate OPDS entry for %s", filename)
                continue
            opds_filenames.add(filename)

            tracked = existing.get(filename)
            if tracked is not None:
                logger.debug(
                    "Already tracked: %s (status=%s)", filename, tracked["status"]
                )
                if tracked["status"] == "removed":
                    # NOTE(review): the status is left untouched because the
                    # right status to restore is not knowable from this
                    # module; surface it so an operator can decide.
                    logger.warning(
                        "ZIM marked removed is back in the catalog: %s", filename
                    )
                continue

            # New ZIM — read accurate metadata via python-libzim
            zim_path = os.path.join(ZIM_DIR, filename)
            if not os.path.isfile(zim_path):
                logger.warning("ZIM file not found on disk: %s", zim_path)
                continue

            logger.info(
                "New ZIM detected: %s — reading metadata via libzim", filename
            )
            try:
                meta = get_libzim_metadata(zim_path)
            except Exception:
                # Per-file boundary: one unreadable ZIM must not abort the
                # scan. It is not inserted, so the next run retries it.
                logger.exception(
                    "Failed to read metadata from %s; skipping", zim_path
                )
                continue

            title = meta.get("title") or entry["title"]
            article_count = meta.get("article_count", 0)
            con.execute(
                """INSERT INTO zim_sources
                (zim_filename, zim_path, zim_uuid, title, description,
                language, category, article_count, status)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'detected')""",
                (
                    filename,
                    zim_path,
                    entry["uuid"],
                    title,
                    meta.get("description") or entry["summary"],
                    meta.get("language") or entry["language"],
                    entry["category"],
                    article_count,
                ),
            )
            new_count += 1
            logger.info(
                " Inserted: %s — title=%r, articles=%s (OPDS said %s)",
                filename,
                title,
                article_count,
                entry["article_count_opds"],
            )

        # Detect removed ZIMs (in DB but not in OPDS, and not already marked removed)
        removed_count = 0
        for filename, row in existing.items():
            if filename not in opds_filenames and row["status"] != "removed":
                con.execute(
                    "UPDATE zim_sources SET status = 'removed' WHERE id = ?",
                    (row["id"],),
                )
                removed_count += 1
                logger.info("Marked removed: %s", filename)

        con.commit()
    finally:
        # Closing without a commit discards pending changes, so a failed run
        # leaves the table as it was.
        con.close()

    logger.info(
        "Scan complete: %d new, %d removed, %d total in catalog",
        new_count, removed_count, len(opds_entries),
    )
# Script entry point: call the project's setup_logging helper with this
# module's logger name, then run a single scan.
if __name__ == "__main__":
    setup_logging("recon.zim_monitor")
    scan_zims()