mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 14:44:54 +02:00
Phase 1: Kiwix foundation — ZIM monitor and kiwix-serve setup
- Add lib/zim_monitor.py: polls kiwix-serve OPDS v2 catalog, detects new ZIMs, reads accurate article count from python-libzim Counter metadata (not inflated OPDS count), inserts into zim_sources table. Idempotent on re-run, marks removed ZIMs.
- DB schema: zim_sources, zim_samples, zim_articles tables (created via sqlite3, not in migrations — matches existing RECON pattern)
- kiwix-tools 3.7.0 installed from binary tarball at /opt/recon/bin/ (Ubuntu 24.04 apt ships 3.5.0 which lacks OPDS v2)
- kiwix.service systemd unit on port 8430
- python-libzim 3.9.0 installed
- Test ZIM: Appropedia EN maxi (496 MB, 19,445 articles)
- Add bin/ to .gitignore (binary tarball, not source)
This commit is contained in:
parent
8d54ff165d
commit
7c1af0f063
2 changed files with 220 additions and 0 deletions
217
lib/zim_monitor.py
Normal file
217
lib/zim_monitor.py
Normal file
|
|
@ -0,0 +1,217 @@
|
|||
"""
|
||||
ZIM Monitor — detects ZIMs loaded in kiwix-serve and tracks them in recon.db.
|
||||
|
||||
Polls the kiwix-serve OPDS v2 catalog, compares against the zim_sources table,
|
||||
and for new ZIMs reads accurate metadata via python-libzim's Counter field.
|
||||
|
||||
Standalone: python3 /opt/recon/lib/zim_monitor.py
|
||||
As module: from lib.zim_monitor import scan_zims
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
import urllib.request
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
sys.path.insert(0, "/opt/recon")
|
||||
from lib.utils import setup_logging
|
||||
|
||||
try:
|
||||
from libzim.reader import Archive
|
||||
HAVE_LIBZIM = True
|
||||
except ImportError:
|
||||
HAVE_LIBZIM = False
|
||||
|
||||
# kiwix-serve OPDS v2 endpoint that fetch_opds() downloads.
# NOTE(review): count=-1 presumably asks for all entries in one page — confirm
# against the kiwix-serve version in use.
OPDS_URL = "http://localhost:8430/catalog/v2/entries?count=-1"
# Directory that scan_zims() joins with each derived filename to find the ZIM.
ZIM_DIR = "/mnt/kiwix"
# SQLite database containing the zim_sources table.
DB_PATH = "/opt/recon/data/recon.db"

# XML namespace used to qualify every element lookup in the OPDS feed.
ATOM_NS = "http://www.w3.org/2005/Atom"

logger = logging.getLogger("recon.zim_monitor")
|
||||
|
||||
|
||||
def _text(element, tag, ns=ATOM_NS):
    """Return the stripped text of the first ``{ns}tag`` child of *element*.

    Returns None when no such child exists or when the child carries no text.
    A child whose text is only whitespace yields an empty string.
    """
    node = element.find("{%s}%s" % (ns, tag))
    # Compare against None explicitly: the truth value of an Element is not a
    # reliable "was it found" test.
    if node is None or not node.text:
        return None
    return node.text.strip()
|
||||
|
||||
|
||||
def parse_counter(counter_str):
    """Parse ZIM ``Counter`` metadata into ``{mimetype: count}``.

    The value is a ``;``-separated list of ``mimetype=count`` items. A MIME
    type may itself carry ``;``-separated ``key=value`` parameters, e.g.
    ``text/html;raw=true=3``. Splitting such an item on ``;`` and then on the
    first ``=`` loses it, so fragments that do not end in ``=<int>`` are
    buffered and re-attached to the next item whose name is itself a
    ``key=value`` parameter.

    Args:
        counter_str: Raw Counter string. ``None`` or an empty string is
            treated as "no data".

    Returns:
        dict mapping each MIME type (including any parameters) to its integer
        count. Items whose count is not an integer are skipped.
    """
    counts = {}
    if not counter_str:
        return counts

    pending = []  # fragments seen since the last complete "name=count" item
    for piece in counter_str.split(";"):
        if not piece.strip():
            continue

        # The count is whatever follows the LAST "=" so that parameter
        # assignments inside the MIME type stay part of the name.
        name, sep, raw_count = piece.rpartition("=")
        try:
            value = int(raw_count.strip()) if sep else None
        except ValueError:
            value = None

        if value is None:
            # Either the bare MIME type of a parameterised item, one of its
            # parameters, or junk; decide once the item completes.
            pending.append(piece.strip())
            continue

        name = name.strip()
        if "=" in name:
            # The counting fragment is a parameter: rebuild the full type from
            # the buffered fragments that precede it.
            name = ";".join(pending + [name])
        # A plain "mimetype=count" item discards buffered junk instead of
        # absorbing it, so malformed neighbours cannot corrupt its key.
        counts[name] = value
        pending = []

    return counts
|
||||
|
||||
|
||||
def fetch_opds():
    """Fetch the OPDS v2 catalog from kiwix-serve and return its entries.

    Returns:
        A list with one dict per Atom ``<entry>``, holding the keys ``uuid``,
        ``title``, ``name``, ``flavour``, ``language``, ``category``,
        ``summary``, ``article_count_opds`` and ``zim_filename``. Missing
        elements yield ``None`` (``0`` for the article count). An empty list
        is returned when the catalog cannot be fetched or is not well-formed
        XML; both cases are logged as errors.
    """
    try:
        with urllib.request.urlopen(OPDS_URL, timeout=10) as resp:
            data = resp.read()
    except Exception as e:
        # Best-effort boundary: any transport failure means "no catalog".
        logger.error("Failed to fetch OPDS catalog: %s", e)
        return []

    # A truncated or non-XML response must degrade the same way a failed
    # fetch does instead of propagating a ParseError to the caller.
    try:
        root = ET.fromstring(data)
    except ET.ParseError as e:
        logger.error("Failed to parse OPDS catalog: %s", e)
        return []

    entries = []
    for entry in root.findall(f"{{{ATOM_NS}}}entry"):
        uuid_raw = _text(entry, "id")
        uuid = uuid_raw.replace("urn:uuid:", "") if uuid_raw else None

        # Derive ZIM filename from the content link href, which looks like
        # /content/appropedia_en_all_maxi_2025-11
        zim_filename = None
        for link in entry.findall(f"{{{ATOM_NS}}}link"):
            if link.get("type") != "text/html":
                continue
            # Drop any trailing slash first so ".../name/" still yields "name".
            name = link.get("href", "").rstrip("/").rsplit("/", 1)[-1]
            if name:
                zim_filename = name + ".zim"
                break

        # One malformed count must not abort parsing of the whole catalog.
        raw_count = _text(entry, "articleCount")
        try:
            article_count_opds = int(raw_count) if raw_count else 0
        except ValueError:
            logger.warning("Ignoring non-numeric articleCount %r", raw_count)
            article_count_opds = 0

        entries.append({
            "uuid": uuid,
            "title": _text(entry, "title"),
            "name": _text(entry, "name"),
            "flavour": _text(entry, "flavour"),
            "language": _text(entry, "language"),
            "category": _text(entry, "category") or None,
            "summary": _text(entry, "summary"),
            "article_count_opds": article_count_opds,
            "zim_filename": zim_filename,
        })
    return entries
|
||||
|
||||
|
||||
def get_libzim_metadata(zim_path):
    """Read descriptive metadata from a ZIM file through python-libzim.

    Returns a dict with ``title``, ``description``, ``language``, ``tags``,
    ``article_count`` and ``counter_raw``. ``article_count`` is the
    ``text/html`` figure from the Counter metadata, or 0 when Counter is
    missing or empty. An empty dict is returned, with a warning, when
    python-libzim could not be imported.
    """
    if not HAVE_LIBZIM:
        logger.warning("python-libzim not available, skipping metadata read")
        return {}

    archive = Archive(zim_path)

    def read_field(name):
        # A RuntimeError from get_metadata is treated as "field absent".
        try:
            raw = archive.get_metadata(name)
        except RuntimeError:
            return None
        return raw.decode("utf-8", errors="replace")

    meta = {
        key: read_field(zim_key)
        for key, zim_key in (
            ("title", "Title"),
            ("description", "Description"),
            ("language", "Language"),
            ("tags", "Tags"),
        )
    }

    counter_raw = read_field("Counter")
    if not counter_raw:
        meta["article_count"] = 0
        meta["counter_raw"] = None
    else:
        meta["article_count"] = parse_counter(counter_raw).get("text/html", 0)
        meta["counter_raw"] = counter_raw

    return meta
|
||||
|
||||
|
||||
def _register_new_zim(con, entry, filename):
    """Insert one newly seen ZIM into zim_sources; return True if a row was added."""
    zim_path = os.path.join(ZIM_DIR, filename)
    if not os.path.isfile(zim_path):
        logger.warning("ZIM file not found on disk: %s", zim_path)
        return False

    logger.info("New ZIM detected: %s — reading metadata via libzim", filename)
    try:
        meta = get_libzim_metadata(zim_path)
    except Exception:
        # One unreadable ZIM must not abort the scan. The file stays
        # untracked, so the next scan will try it again.
        logger.exception("Failed to read metadata from %s; skipping", zim_path)
        return False

    # Prefer values read from the ZIM itself and fall back to the OPDS entry.
    title = meta.get("title") or entry["title"]
    article_count = meta.get("article_count", 0)

    con.execute(
        """INSERT INTO zim_sources
           (zim_filename, zim_path, zim_uuid, title, description,
            language, category, article_count, status)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'detected')""",
        (
            filename,
            zim_path,
            entry["uuid"],
            title,
            meta.get("description") or entry["summary"],
            meta.get("language") or entry["language"],
            entry["category"],
            article_count,
        ),
    )
    logger.info(
        "  Inserted: %s — title=%r, articles=%s (OPDS said %s)",
        filename,
        title,
        article_count,
        entry["article_count_opds"],
    )
    return True


def scan_zims():
    """Compare the OPDS catalog against zim_sources and record the differences.

    Catalog entries whose derived filename is not yet tracked are inserted
    with status ``detected``. Tracked rows whose filename is absent from the
    catalog are set to ``removed``. All changes are committed together; if
    anything raises, they are rolled back and the connection is still closed.
    """
    logger.info("Scanning kiwix-serve OPDS catalog...")
    opds_entries = fetch_opds()
    if not opds_entries:
        # fetch_opds() returns [] both for an empty catalog and for a failed
        # fetch, so returning early avoids marking every tracked ZIM as
        # removed when kiwix-serve is merely unreachable.
        logger.info("No entries in OPDS catalog (or fetch failed)")
        return

    logger.info("OPDS returned %d entries", len(opds_entries))

    new_count = 0
    removed_count = 0

    con = sqlite3.connect(DB_PATH)
    try:
        con.row_factory = sqlite3.Row

        # "with con" commits on success and rolls back if anything raises.
        with con:
            # Get existing zim_sources keyed by filename
            existing = {
                row["zim_filename"]: dict(row)
                for row in con.execute(
                    "SELECT id, zim_filename, status FROM zim_sources"
                )
            }

            opds_filenames = set()
            for entry in opds_entries:
                filename = entry["zim_filename"]
                if not filename:
                    logger.warning(
                        "Skipping OPDS entry with no derivable filename: %s", entry
                    )
                    continue

                if filename in opds_filenames:
                    # Two catalog entries resolving to the same file would
                    # otherwise be inserted twice within one scan.
                    logger.debug("Duplicate OPDS entry for %s, ignoring", filename)
                    continue
                opds_filenames.add(filename)

                if filename in existing:
                    # NOTE(review): a row already marked 'removed' stays
                    # 'removed' even though the ZIM is back in the catalog —
                    # confirm whether it should be reinstated.
                    logger.debug(
                        "Already tracked: %s (status=%s)",
                        filename,
                        existing[filename]["status"],
                    )
                    continue

                if _register_new_zim(con, entry, filename):
                    new_count += 1

            # Detect removed ZIMs (in DB but not in OPDS, and not already
            # marked removed)
            for filename, row in existing.items():
                if filename in opds_filenames or row["status"] == "removed":
                    continue
                con.execute(
                    "UPDATE zim_sources SET status = 'removed' WHERE id = ?",
                    (row["id"],),
                )
                removed_count += 1
                logger.info("Marked removed: %s", filename)
    finally:
        con.close()

    logger.info(
        "Scan complete: %d new, %d removed, %d total in catalog",
        new_count, removed_count, len(opds_entries),
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Standalone run: configure logging for this module, then do one scan.
    setup_logging("recon.zim_monitor")
    scan_zims()
|
||||
Loading…
Add table
Add a link
Reference in a new issue