mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 14:44:54 +02:00
- Add lib/zim_monitor.py: polls kiwix-serve OPDS v2 catalog, detects new ZIMs, reads accurate article count from python-libzim Counter metadata (not inflated OPDS count), inserts into zim_sources table. Idempotent on re-run, marks removed ZIMs.
- DB schema: zim_sources, zim_samples, zim_articles tables (created via sqlite3, not in migrations — matches existing RECON pattern)
- kiwix-tools 3.7.0 installed from binary tarball at /opt/recon/bin/ (Ubuntu 24.04 apt ships 3.5.0 which lacks OPDS v2)
- kiwix.service systemd unit on port 8430
- python-libzim 3.9.0 installed
- Test ZIM: Appropedia EN maxi (496 MB, 19,445 articles)
- Add bin/ to .gitignore (binary tarball, not source)
217 lines
6.8 KiB
Python
217 lines
6.8 KiB
Python
"""
|
|
ZIM Monitor — detects ZIMs loaded in kiwix-serve and tracks them in recon.db.

Polls the kiwix-serve OPDS v2 catalog, compares against the zim_sources table,
and for new ZIMs reads accurate metadata via python-libzim's Counter field.

Standalone: python3 /opt/recon/lib/zim_monitor.py
As module: from lib.zim_monitor import scan_zims
|
|
"""
|
|
import logging
|
|
import os
|
|
import sqlite3
|
|
import sys
|
|
import urllib.request
|
|
from xml.etree import ElementTree as ET
|
|
|
|
sys.path.insert(0, "/opt/recon")
|
|
from lib.utils import setup_logging
|
|
|
|
# python-libzim is optional: record whether it imported so that
# get_libzim_metadata() can skip the metadata read when it is missing.
try:
    from libzim.reader import Archive
except ImportError:
    HAVE_LIBZIM = False
else:
    HAVE_LIBZIM = True
|
|
|
|
# Atom namespace URI used to qualify element names when walking the OPDS feed.
ATOM_NS = "http://www.w3.org/2005/Atom"

# kiwix-serve OPDS v2 endpoint that fetch_opds() downloads.
OPDS_URL = "http://localhost:8430/catalog/v2/entries?count=-1"

# Directory joined with a ZIM filename to locate the file on disk.
ZIM_DIR = "/mnt/kiwix"

# SQLite database holding the zim_sources table that scan_zims() maintains.
DB_PATH = "/opt/recon/data/recon.db"

logger = logging.getLogger("recon.zim_monitor")
|
|
|
|
|
|
def _text(element, tag, ns=ATOM_NS):
    """Return the stripped text of a direct child element, or None.

    Args:
        element: Parent ``xml.etree.ElementTree`` element to search.
        tag: Local (un-namespaced) tag name of the child to look up.
        ns: XML namespace URI the child is qualified with (Atom by default).

    Returns:
        The child's text with surrounding whitespace removed, or ``None`` when
        the child is missing, has no text, or holds only whitespace.
    """
    child = element.find(f"{{{ns}}}{tag}")
    if child is None or child.text is None:
        return None
    # Whitespace-only text counts as absent so callers get None, never "".
    return child.text.strip() or None
|
|
|
|
|
|
def parse_counter(counter_str):
    """Parse ZIM ``Counter`` metadata into ``{mimetype: count}``.

    The input is split on ``;`` and each fragment on its first ``=``, e.g.
    ``"text/html=19445;image/png=120"``.

    Args:
        counter_str: Raw Counter string. ``None`` or an empty string yields
            an empty dict instead of raising.

    Returns:
        dict mapping each MIME type (whitespace-stripped) to its integer
        count. Fragments without ``=`` or whose count is not an integer are
        skipped; a repeated MIME type keeps its last count.
    """
    if not counter_str:
        return {}

    counts = {}
    for fragment in counter_str.split(";"):
        mime, sep, raw_count = fragment.partition("=")
        if not sep:
            continue
        try:
            counts[mime.strip()] = int(raw_count.strip())
        except ValueError:
            # Best-effort parse: one malformed fragment must not discard the
            # valid counts around it.
            continue
    return counts
|
|
|
|
|
|
def fetch_opds():
    """Fetch the OPDS v2 catalog from kiwix-serve and flatten its entries.

    Returns:
        A list with one dict per Atom ``<entry>``, holding the keys ``uuid``,
        ``title``, ``name``, ``flavour``, ``language``, ``category``,
        ``summary``, ``article_count_opds`` and ``zim_filename``. Fields
        missing from the feed are ``None`` (``article_count_opds`` is 0).
        An empty list is returned when the request fails or the response is
        not well-formed XML; both cases are logged.
    """
    try:
        with urllib.request.urlopen(OPDS_URL, timeout=10) as resp:
            data = resp.read()
    except Exception as e:
        # Deliberately broad: any fetch failure means "no catalog this run".
        logger.error("Failed to fetch OPDS catalog: %s", e)
        return []

    try:
        root = ET.fromstring(data)
    except ET.ParseError as e:
        # A truncated or non-XML body must not crash the scan.
        logger.error("Failed to parse OPDS catalog: %s", e)
        return []

    entries = []
    for entry in root.findall(f"{{{ATOM_NS}}}entry"):
        uuid_raw = _text(entry, "id")
        uuid = uuid_raw.replace("urn:uuid:", "") if uuid_raw else None

        # Derive ZIM filename from the content link href
        zim_filename = None
        for link in entry.findall(f"{{{ATOM_NS}}}link"):
            if link.get("type") != "text/html":
                continue
            # href looks like /content/appropedia_en_all_maxi_2025-11
            stem = link.get("href", "").rsplit("/", 1)[-1]
            if stem:
                zim_filename = stem + ".zim"
                break

        raw_count = _text(entry, "articleCount")
        try:
            article_count_opds = int(raw_count) if raw_count else 0
        except ValueError:
            # The OPDS count is informational only; tolerate junk values.
            logger.warning(
                "Ignoring non-numeric articleCount %r for entry %s",
                raw_count, uuid,
            )
            article_count_opds = 0

        entries.append({
            "uuid": uuid,
            "title": _text(entry, "title"),
            "name": _text(entry, "name"),
            "flavour": _text(entry, "flavour"),
            "language": _text(entry, "language"),
            "category": _text(entry, "category") or None,
            "summary": _text(entry, "summary"),
            "article_count_opds": article_count_opds,
            "zim_filename": zim_filename,
        })
    return entries
|
|
|
|
|
|
def get_libzim_metadata(zim_path):
    """Read metadata straight from a ZIM file using python-libzim.

    Args:
        zim_path: Path of the ZIM file to open.

    Returns:
        An empty dict when python-libzim is not importable. Otherwise a dict
        with ``title``, ``description``, ``language`` and ``tags`` (each a
        str, or ``None`` when reading that key raises ``RuntimeError``),
        ``article_count`` (the ``text/html`` entry of the parsed Counter
        metadata, 0 when absent) and ``counter_raw`` (the Counter string, or
        ``None`` when missing or empty).
    """
    if not HAVE_LIBZIM:
        logger.warning("python-libzim not available, skipping metadata read")
        return {}

    archive = Archive(zim_path)

    def read_field(key):
        # A key whose lookup raises RuntimeError is reported as None.
        try:
            raw = archive.get_metadata(key)
        except RuntimeError:
            return None
        return raw.decode("utf-8", errors="replace")

    field_keys = (
        ("title", "Title"),
        ("description", "Description"),
        ("language", "Language"),
        ("tags", "Tags"),
    )
    meta = {field: read_field(key) for field, key in field_keys}

    counter_raw = read_field("Counter")
    if not counter_raw:
        meta["article_count"] = 0
        meta["counter_raw"] = None
        return meta

    meta["article_count"] = parse_counter(counter_raw).get("text/html", 0)
    meta["counter_raw"] = counter_raw
    return meta
|
|
|
|
|
|
def scan_zims():
    """Reconcile the kiwix-serve OPDS catalog with the ``zim_sources`` table.

    For every catalog entry whose filename is not yet tracked and whose file
    exists under ``ZIM_DIR``, metadata is read via ``get_libzim_metadata`` and
    a row is inserted with status ``'detected'`` (libzim values win; OPDS
    values are the fallback). Tracked rows whose filename is absent from the
    catalog are set to status ``'removed'``. All changes are committed once
    at the end; the connection is closed even if an error occurs.

    Returns:
        None. Returns early, without touching the database, when the catalog
        is empty or could not be fetched.
    """
    logger.info("Scanning kiwix-serve OPDS catalog...")
    opds_entries = fetch_opds()
    if not opds_entries:
        logger.info("No entries in OPDS catalog (or fetch failed)")
        return

    logger.info("OPDS returned %d entries", len(opds_entries))

    con = sqlite3.connect(DB_PATH)
    try:
        con.row_factory = sqlite3.Row

        # Get existing zim_sources keyed by filename
        existing = {
            row["zim_filename"]: dict(row)
            for row in con.execute(
                "SELECT id, zim_filename, status FROM zim_sources"
            )
        }

        opds_filenames = set()
        new_count = 0

        for entry in opds_entries:
            filename = entry["zim_filename"]
            if not filename:
                logger.warning(
                    "Skipping OPDS entry with no derivable filename: %s", entry
                )
                continue

            if filename in opds_filenames:
                # Same file listed twice in one catalog: handle it only once
                # so it cannot be inserted twice in a single run.
                logger.debug("Duplicate catalog entry ignored: %s", filename)
                continue
            opds_filenames.add(filename)

            if filename in existing:
                # NOTE(review): a row already marked 'removed' stays 'removed'
                # when its ZIM reappears in the catalog — confirm whether it
                # should be re-activated.
                logger.debug(
                    "Already tracked: %s (status=%s)",
                    filename, existing[filename]["status"],
                )
                continue

            # New ZIM — read accurate metadata via python-libzim
            zim_path = os.path.join(ZIM_DIR, filename)
            if not os.path.isfile(zim_path):
                logger.warning("ZIM file not found on disk: %s", zim_path)
                continue

            logger.info(
                "New ZIM detected: %s — reading metadata via libzim", filename
            )
            try:
                meta = get_libzim_metadata(zim_path)
            except (RuntimeError, OSError) as e:
                # Presumably an unreadable or still-copying ZIM (exception
                # types raised by libzim to be confirmed). Skip it without a
                # row so the next scan retries, instead of aborting the run.
                logger.error(
                    "Failed to read ZIM metadata from %s: %s", zim_path, e
                )
                continue

            title = meta.get("title") or entry["title"]
            article_count = meta.get("article_count", 0)

            con.execute(
                """INSERT INTO zim_sources
                (zim_filename, zim_path, zim_uuid, title, description,
                language, category, article_count, status)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'detected')""",
                (
                    filename,
                    zim_path,
                    entry["uuid"],
                    title,
                    meta.get("description") or entry["summary"],
                    meta.get("language") or entry["language"],
                    entry["category"],
                    article_count,
                ),
            )
            new_count += 1
            logger.info(
                " Inserted: %s — title=%r, articles=%s (OPDS said %s)",
                filename,
                title,
                article_count,
                entry["article_count_opds"],
            )

        # Detect removed ZIMs (in DB but not in OPDS, and not already marked removed)
        removed_count = 0
        for filename, row in existing.items():
            if filename in opds_filenames or row["status"] == "removed":
                continue
            con.execute(
                "UPDATE zim_sources SET status = 'removed' WHERE id = ?",
                (row["id"],),
            )
            removed_count += 1
            logger.info("Marked removed: %s", filename)

        con.commit()
    finally:
        # Always release the database handle, including on error paths where
        # nothing was committed.
        con.close()

    logger.info(
        "Scan complete: %d new, %d removed, %d total in catalog",
        new_count, removed_count, len(opds_entries),
    )
|
|
|
|
|
|
if __name__ == "__main__":
    # Standalone run: set up logging for this module, then do a single scan.
    setup_logging("recon.zim_monitor")
    scan_zims()
|