Kiwix integration: ZIM processor, dashboard tab, wiki.echo6.co citations

- ZIM processor: extract articles from ZIM files, feed into existing enrichment pipeline
- Dashboard: Kiwix tab with library table, ingest toggle, upload, remove
- kiwix-serve on port 8430, wiki.echo6.co behind Authentik
- Citation URLs point to wiki.echo6.co/{zimname}/{article_path}
- Dashboard shows WIKI type badge for ZIM-sourced content
- Appropedia EN (19,445 articles) fully ingested as proof of concept
This commit is contained in:
Matt 2026-04-17 07:00:24 +00:00
commit 2635160887
7 changed files with 521 additions and 3 deletions

View file

@ -35,12 +35,15 @@ _cache = {
'qdrant_scroll': None,
'qdrant_scroll_ts': 0,
'quick_stats': None,
'kiwix_sources': None,
}
app = Flask(__name__,
template_folder=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'templates'),
static_folder=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'static'))
app.config['MAX_CONTENT_LENGTH'] = None # ZIM files can be multi-GB
# ── Navigation Constants ──
KNOWLEDGE_SUBNAV = [
@ -56,6 +59,8 @@ PEERTUBE_SUBNAV = [
{'href': '/peertube/channels', 'label': 'Channels'},
]
KIWIX_SUBNAV = [] # Single-page, no subnav needed
SETTINGS_SUBNAV = [
{'href': '/settings/keys', 'label': 'API Keys'},
{'href': '/settings/cookies', 'label': 'YouTube Cookies'},
@ -908,6 +913,7 @@ def _build_knowledge_stats():
c.source,
CASE
WHEN c.source = 'stream.echo6.co' THEN 'transcript'
WHEN c.source = 'kiwix' THEN 'wiki'
WHEN c.path LIKE 'http%' THEN 'web'
ELSE 'pdf'
END as type,
@ -967,6 +973,7 @@ def _build_knowledge_stats():
d.status, d.concepts_extracted, d.vectors_inserted,
CASE
WHEN c.source = 'stream.echo6.co' THEN 'transcript'
WHEN c.source = 'kiwix' THEN 'wiki'
WHEN d.path LIKE 'http%' THEN 'web'
ELSE 'pdf'
END as type
@ -1072,6 +1079,12 @@ def start_cache_warmer(stop_event=None):
except Exception as e:
logger.warning(f" Quick stats warm-up failed: {e}")
try:
_cache['kiwix_sources'] = _build_kiwix_sources()
logger.info(" Kiwix sources cached")
except Exception as e:
logger.warning(f" Kiwix sources warm-up failed: {e}")
logger.info("Cache warmer ready — all data pre-loaded")
# Continuous refresh loop
@ -1098,6 +1111,10 @@ def start_cache_warmer(stop_event=None):
_cache['quick_stats'] = _build_quick_stats()
except Exception:
pass
try:
_cache['kiwix_sources'] = _build_kiwix_sources()
except Exception:
pass
# PeerTube dashboard: every 30s (cycle 2, offset)
if cycle % 2 == 1:
@ -1930,6 +1947,297 @@ def api_peertube_dashboard():
return jsonify(_cache['pt_dashboard'])
# ── Kiwix Dashboard ──
@app.route('/kiwix')
def kiwix_dashboard():
return render_template('kiwix/dashboard.html',
domain='kiwix', subnav=KIWIX_SUBNAV, active_page='/kiwix')
@app.route('/api/kiwix/sources')
def api_kiwix_sources():
"""Serve pre-cached Kiwix sources data (never blocks)."""
if _cache['kiwix_sources'] is None:
return jsonify({'error': 'Warming up, try again in a few seconds'}), 503
return jsonify(_cache['kiwix_sources'])
@app.route('/api/kiwix/toggle-ingest/<int:source_id>', methods=['POST'])
def api_kiwix_toggle_ingest(source_id):
"""Toggle ingest_enabled on a ZIM source."""
db = StatusDB()
conn = db._get_conn()
row = conn.execute("SELECT id, status, ingest_enabled FROM zim_sources WHERE id = ?", (source_id,)).fetchone()
if not row:
return jsonify({'error': 'Source not found'}), 404
data = request.get_json(silent=True) or {}
new_val = 1 if data.get('enabled', not row['ingest_enabled']) else 0
conn.execute("UPDATE zim_sources SET ingest_enabled = ? WHERE id = ?", (new_val, source_id))
conn.commit()
# If toggling ON and source is eligible, spawn ingest in background
if new_val == 1 and row['status'] == 'detected':
_spawn_zim_ingest(source_id)
return jsonify({'ok': True, 'ingest_enabled': new_val})
@app.route('/api/kiwix/trigger-ingest/<int:source_id>', methods=['POST'])
def api_kiwix_trigger_ingest(source_id):
"""Explicit one-shot ingest trigger."""
db = StatusDB()
conn = db._get_conn()
row = conn.execute("SELECT id FROM zim_sources WHERE id = ?", (source_id,)).fetchone()
if not row:
return jsonify({'error': 'Source not found'}), 404
_spawn_zim_ingest(source_id)
return jsonify({'ok': True})
@app.route('/api/kiwix/upload', methods=['POST'])
def api_kiwix_upload():
"""Accept ZIM file upload, register with kiwix-serve, scan."""
import subprocess
if 'file' not in request.files:
return jsonify({'error': 'No file provided'}), 400
f = request.files['file']
if not f.filename or not f.filename.endswith('.zim'):
return jsonify({'error': 'File must be a .zim file'}), 400
filename = secure_filename(f.filename)
dest = os.path.join('/mnt/kiwix', filename)
tmp_dest = dest + '.tmp'
try:
f.save(tmp_dest)
os.rename(tmp_dest, dest)
except Exception as e:
if os.path.exists(tmp_dest):
os.remove(tmp_dest)
return jsonify({'error': f'Save failed: {e}'}), 500
# Register with kiwix-serve library
try:
subprocess.run(
['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'add', dest],
capture_output=True, text=True, timeout=30
)
except Exception as e:
logger.warning(f"kiwix-manage add failed: {e}")
# Scan for new entry
try:
from .zim_monitor import scan_zims
scan_zims()
except Exception as e:
logger.warning(f"scan_zims after upload failed: {e}")
# Refresh cache
try:
_cache['kiwix_sources'] = _build_kiwix_sources()
except Exception:
pass
return jsonify({'ok': True, 'filename': filename})
@app.route('/api/kiwix/remove/<int:source_id>', methods=['POST'])
def api_kiwix_remove(source_id):
"""Remove a ZIM source: delete vectors, DB records, library entry, and file."""
import subprocess
import requests as req
db = StatusDB()
conn = db._get_conn()
row = conn.execute("SELECT * FROM zim_sources WHERE id = ?", (source_id,)).fetchone()
if not row:
return jsonify({'error': 'Source not found'}), 404
zim_source = dict(row)
zim_filename = zim_source['zim_filename']
zim_path = zim_source['zim_path']
zim_title = zim_source.get('title', zim_filename)
results = {'vectors_deleted': 0, 'docs_deleted': 0, 'file_deleted': False}
# Step 1: Find all document hashes for this ZIM source
doc_hashes = [r['hash'] for r in conn.execute(
"SELECT c.hash FROM catalogue c WHERE c.source = 'kiwix' AND c.category = ?",
(zim_title,)
).fetchall()]
# Step 2: Delete vectors from Qdrant
if doc_hashes:
config = get_config()
qdrant_host = config.get('vector_db', {}).get('host', '100.64.0.14')
qdrant_port = config.get('vector_db', {}).get('port', 6333)
collection = config.get('vector_db', {}).get('collection', 'recon_knowledge')
# Delete in batches of 100 hashes
for i in range(0, len(doc_hashes), 100):
batch = doc_hashes[i:i+100]
try:
resp = req.post(
f"http://{qdrant_host}:{qdrant_port}/collections/{collection}/points/delete",
json={
"filter": {
"must": [{
"key": "doc_hash",
"match": {"any": batch}
}]
}
},
timeout=30
)
if resp.status_code == 200:
results['vectors_deleted'] += len(batch)
except Exception as e:
logger.warning(f"Qdrant delete batch failed: {e}")
# Step 3: Delete DB records
for h in doc_hashes:
# Delete processing directory if it exists
text_dir_row = conn.execute("SELECT text_dir FROM documents WHERE hash = ?", (h,)).fetchone()
if text_dir_row and text_dir_row['text_dir']:
try:
import shutil
shutil.rmtree(text_dir_row['text_dir'], ignore_errors=True)
except Exception:
pass
conn.execute("DELETE FROM documents WHERE hash = ?", (h,))
conn.execute("DELETE FROM catalogue WHERE hash = ?", (h,))
results['docs_deleted'] = len(doc_hashes)
# Delete zim_articles records
conn.execute("DELETE FROM zim_articles WHERE zim_source_id = ?", (source_id,))
# Delete zim_sources record
conn.execute("DELETE FROM zim_sources WHERE id = ?", (source_id,))
conn.commit()
# Step 4: Remove from kiwix-serve library
try:
# Get the book ID from library.xml
subprocess.run(
['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'remove', zim_filename.replace('.zim', '')],
capture_output=True, text=True, timeout=10
)
except Exception as e:
logger.warning(f"kiwix-manage remove failed: {e}")
# Step 5: Delete the ZIM file
if os.path.isfile(zim_path):
try:
os.remove(zim_path)
results['file_deleted'] = True
except Exception as e:
logger.warning(f"ZIM file delete failed: {e}")
results['file_deleted'] = False
# Refresh cache
try:
_cache['kiwix_sources'] = _build_kiwix_sources()
except Exception:
pass
logger.info(f"Removed ZIM source '{zim_title}': {results}")
return jsonify({'ok': True, 'results': results})
def _spawn_zim_ingest(source_id):
"""Start ZIM ingestion in a background thread."""
def _run():
try:
from .processors.zim_processor import ingest_zim
config = get_config()
db = StatusDB()
logger.info(f"Starting ZIM ingest for source {source_id}")
result = ingest_zim(source_id, db, config)
logger.info(f"ZIM ingest complete for source {source_id}: {result}")
# Refresh cache after completion
try:
_cache['kiwix_sources'] = _build_kiwix_sources()
except Exception:
pass
except Exception as e:
logger.error(f"ZIM ingest failed for source {source_id}: {e}")
t = threading.Thread(target=_run, daemon=True, name=f'zim-ingest-{source_id}')
t.start()
def _build_kiwix_sources():
"""Build Kiwix sources data for the dashboard cache."""
import urllib.request
db = StatusDB()
conn = db._get_conn()
# Get all ZIM sources
rows = conn.execute("""
SELECT id, zim_filename, title, description, language, category,
article_count, status, processed_count, skipped_count, error_count,
ingest_enabled, detected_at, started_at, completed_at
FROM zim_sources
ORDER BY detected_at DESC
""").fetchall()
sources = []
total_articles = 0
total_processed = 0
total_in_pipeline = 0
for r in rows:
source = dict(r)
total_articles += r['article_count'] or 0
total_processed += r['processed_count'] or 0
# Get pipeline stats for this source's documents
pipeline = {}
try:
pipe_rows = conn.execute("""
SELECT d.status, COUNT(*) as cnt
FROM documents d
JOIN catalogue c ON d.hash = c.hash
WHERE c.source = 'kiwix'
GROUP BY d.status
""").fetchall()
for pr in pipe_rows:
pipeline[pr['status']] = pr['cnt']
except Exception:
pass
in_pipe = sum(v for k, v in pipeline.items() if k not in ('complete', 'failed'))
total_in_pipeline += in_pipe
source['pipeline'] = pipeline
sources.append(source)
# Check kiwix-serve health
kiwix_status = 'inactive'
try:
resp = urllib.request.urlopen("http://localhost:8430", timeout=3)
if resp.status == 200:
kiwix_status = 'active'
except Exception:
pass
return {
'sources': sources,
'kiwix_serve': {'status': kiwix_status, 'url': 'https://wiki.echo6.co'},
'totals': {
'sources': len(sources),
'articles': total_articles,
'processed': total_processed,
'in_pipeline': total_in_pipeline,
}
}
# ── Metrics API ──
@app.route('/api/metrics/history')