diff --git a/lib/api.py b/lib/api.py index 757ebf4..980578b 100644 --- a/lib/api.py +++ b/lib/api.py @@ -35,12 +35,15 @@ _cache = { 'qdrant_scroll': None, 'qdrant_scroll_ts': 0, 'quick_stats': None, + 'kiwix_sources': None, } app = Flask(__name__, template_folder=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'templates'), static_folder=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'static')) +app.config['MAX_CONTENT_LENGTH'] = None # ZIM files can be multi-GB + # ── Navigation Constants ── KNOWLEDGE_SUBNAV = [ @@ -56,6 +59,8 @@ PEERTUBE_SUBNAV = [ {'href': '/peertube/channels', 'label': 'Channels'}, ] + +KIWIX_SUBNAV = [] # Single-page, no subnav needed SETTINGS_SUBNAV = [ {'href': '/settings/keys', 'label': 'API Keys'}, {'href': '/settings/cookies', 'label': 'YouTube Cookies'}, @@ -908,6 +913,7 @@ def _build_knowledge_stats(): c.source, CASE WHEN c.source = 'stream.echo6.co' THEN 'transcript' + WHEN c.source = 'kiwix' THEN 'wiki' WHEN c.path LIKE 'http%' THEN 'web' ELSE 'pdf' END as type, @@ -967,6 +973,7 @@ def _build_knowledge_stats(): d.status, d.concepts_extracted, d.vectors_inserted, CASE WHEN c.source = 'stream.echo6.co' THEN 'transcript' + WHEN c.source = 'kiwix' THEN 'wiki' WHEN d.path LIKE 'http%' THEN 'web' ELSE 'pdf' END as type @@ -1072,6 +1079,12 @@ def start_cache_warmer(stop_event=None): except Exception as e: logger.warning(f" Quick stats warm-up failed: {e}") + try: + _cache['kiwix_sources'] = _build_kiwix_sources() + logger.info(" Kiwix sources cached") + except Exception as e: + logger.warning(f" Kiwix sources warm-up failed: {e}") + logger.info("Cache warmer ready — all data pre-loaded") # Continuous refresh loop @@ -1098,6 +1111,10 @@ def start_cache_warmer(stop_event=None): _cache['quick_stats'] = _build_quick_stats() except Exception: pass + try: + _cache['kiwix_sources'] = _build_kiwix_sources() + except Exception: + pass # PeerTube dashboard: every 30s (cycle 2, offset) if cycle % 2 
# ── Kiwix Dashboard ──

@app.route('/kiwix')
def kiwix_dashboard():
    """Render the single-page Kiwix dashboard (data is loaded client-side)."""
    return render_template('kiwix/dashboard.html',
                           domain='kiwix', subnav=KIWIX_SUBNAV, active_page='/kiwix')


@app.route('/api/kiwix/sources')
def api_kiwix_sources():
    """Serve pre-cached Kiwix sources data (never blocks).

    Returns HTTP 503 with an error payload until the cache entry has been
    populated at least once.
    """
    # Read the cache slot once so a concurrent refresh cannot change the value
    # between the None check and the response.
    cached = _cache['kiwix_sources']
    if cached is None:
        return jsonify({'error': 'Warming up, try again in a few seconds'}), 503
    return jsonify(cached)


# The id is part of the URL path (the dashboard JS posts to
# '/api/kiwix/toggle-ingest/' + id), so the rule must declare the variable
# that the view function receives.
@app.route('/api/kiwix/toggle-ingest/<int:source_id>', methods=['POST'])
def api_kiwix_toggle_ingest(source_id):
    """Toggle ingest_enabled on a ZIM source.

    The JSON body may carry ``{"enabled": bool}``; when it is absent the
    current flag is inverted. Turning the flag on for a source whose status
    is 'detected' also starts a background ingest.

    Returns 404 when no zim_sources row has the given id.
    """
    db = StatusDB()
    conn = db._get_conn()
    row = conn.execute(
        "SELECT id, status, ingest_enabled FROM zim_sources WHERE id = ?",
        (source_id,)
    ).fetchone()
    if not row:
        return jsonify({'error': 'Source not found'}), 404

    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # Missing, invalid, or non-object JSON: fall back to a plain toggle
        # instead of failing on data.get().
        data = {}
    new_val = 1 if data.get('enabled', not row['ingest_enabled']) else 0
    conn.execute("UPDATE zim_sources SET ingest_enabled = ? WHERE id = ?", (new_val, source_id))
    conn.commit()

    # If toggling ON and source is eligible, spawn ingest in background
    if new_val == 1 and row['status'] == 'detected':
        _spawn_zim_ingest(source_id)

    return jsonify({'ok': True, 'ingest_enabled': new_val})


@app.route('/api/kiwix/trigger-ingest/<int:source_id>', methods=['POST'])
def api_kiwix_trigger_ingest(source_id):
    """Explicit one-shot ingest trigger.

    Starts a background ingest for the source regardless of its
    ingest_enabled flag. Returns 404 when the source id is unknown.
    """
    db = StatusDB()
    conn = db._get_conn()
    row = conn.execute("SELECT id FROM zim_sources WHERE id = ?", (source_id,)).fetchone()
    if not row:
        return jsonify({'error': 'Source not found'}), 404

    _spawn_zim_ingest(source_id)
    return jsonify({'ok': True})


@app.route('/api/kiwix/upload', methods=['POST'])
def api_kiwix_upload():
    """Accept ZIM file upload, register with kiwix-serve, scan.

    The file is written to a ``.tmp`` sibling first and renamed into place so
    a partially uploaded file never appears under its final name. Library
    registration, rescanning and the cache refresh are best-effort: failures
    are logged and the upload is still reported as successful.

    Returns 400 for a missing file or a name that is not a usable ``.zim``
    filename, 500 when the file cannot be saved.
    """
    import subprocess
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    f = request.files['file']
    if not f.filename or not f.filename.endswith('.zim'):
        return jsonify({'error': 'File must be a .zim file'}), 400

    filename = secure_filename(f.filename)
    # Re-check after sanitising: a name whose stem is stripped entirely
    # (e.g. '.zim' or an all-non-ASCII stem) collapses to 'zim'.
    if not filename.endswith('.zim'):
        return jsonify({'error': 'File must be a .zim file'}), 400

    dest = os.path.join('/mnt/kiwix', filename)
    tmp_dest = dest + '.tmp'

    try:
        f.save(tmp_dest)
        os.rename(tmp_dest, dest)
    except Exception as e:
        # Best-effort cleanup; never let it mask the original failure.
        try:
            if os.path.exists(tmp_dest):
                os.remove(tmp_dest)
        except OSError as cleanup_err:
            logger.warning(f"Could not remove partial upload {tmp_dest}: {cleanup_err}")
        return jsonify({'error': f'Save failed: {e}'}), 500

    # Register with kiwix-serve library
    try:
        proc = subprocess.run(
            ['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'add', dest],
            capture_output=True, text=True, timeout=30
        )
        if proc.returncode != 0:
            # subprocess.run does not raise on a non-zero exit without check=True.
            logger.warning(
                f"kiwix-manage add exited with {proc.returncode}: {(proc.stderr or '').strip()}"
            )
    except Exception as e:
        logger.warning(f"kiwix-manage add failed: {e}")

    # Scan for new entry
    try:
        from .zim_monitor import scan_zims
        scan_zims()
    except Exception as e:
        logger.warning(f"scan_zims after upload failed: {e}")

    # Refresh cache
    try:
        _cache['kiwix_sources'] = _build_kiwix_sources()
    except Exception as e:
        logger.warning(f"Kiwix sources cache refresh after upload failed: {e}")

    return jsonify({'ok': True, 'filename': filename})
# The id is part of the URL path (the dashboard JS posts to
# '/api/kiwix/remove/' + id), so the rule must declare the variable that the
# view function receives.
@app.route('/api/kiwix/remove/<int:source_id>', methods=['POST'])
def api_kiwix_remove(source_id):
    """Remove a ZIM source: delete vectors, DB records, library entry, and file.

    Documents belonging to the source are those catalogued with
    ``source = 'kiwix'`` and ``category`` equal to the source title (falling
    back to the ZIM filename when the title is empty).

    The response's ``results`` dict reports:
      * ``vectors_deleted`` - number of document hashes whose Qdrant delete
        request returned HTTP 200 (not a count of individual vectors),
      * ``docs_deleted`` - number of documents removed from the database,
      * ``file_deleted`` - whether the ZIM file itself was removed.

    Qdrant, kiwix-manage and file removal are best-effort; failures are
    logged and do not abort the removal. Returns 404 for an unknown id.
    """
    import shutil
    import subprocess
    import requests as req

    db = StatusDB()
    conn = db._get_conn()
    row = conn.execute("SELECT * FROM zim_sources WHERE id = ?", (source_id,)).fetchone()
    if not row:
        return jsonify({'error': 'Source not found'}), 404

    zim_source = dict(row)
    zim_filename = zim_source['zim_filename']
    zim_path = zim_source['zim_path']
    # `.get('title', default)` never falls back when the column exists but is
    # NULL; use `or` so an empty title resolves to the filename.
    zim_title = zim_source.get('title') or zim_filename
    results = {'vectors_deleted': 0, 'docs_deleted': 0, 'file_deleted': False}

    # Step 1: Find all document hashes for this ZIM source
    doc_hashes = [r['hash'] for r in conn.execute(
        "SELECT c.hash FROM catalogue c WHERE c.source = 'kiwix' AND c.category = ?",
        (zim_title,)
    ).fetchall()]

    # Step 2: Delete vectors from Qdrant
    if doc_hashes:
        vector_cfg = get_config().get('vector_db', {})
        qdrant_host = vector_cfg.get('host', '100.64.0.14')
        qdrant_port = vector_cfg.get('port', 6333)
        collection = vector_cfg.get('collection', 'recon_knowledge')
        delete_url = f"http://{qdrant_host}:{qdrant_port}/collections/{collection}/points/delete"

        # Delete in batches of 100 hashes
        for i in range(0, len(doc_hashes), 100):
            batch = doc_hashes[i:i+100]
            try:
                resp = req.post(
                    delete_url,
                    json={
                        "filter": {
                            "must": [{
                                "key": "doc_hash",
                                "match": {"any": batch}
                            }]
                        }
                    },
                    timeout=30
                )
                if resp.status_code == 200:
                    results['vectors_deleted'] += len(batch)
                else:
                    logger.warning(f"Qdrant delete batch returned HTTP {resp.status_code}")
            except Exception as e:
                logger.warning(f"Qdrant delete batch failed: {e}")

    # Step 3: Delete DB records
    for h in doc_hashes:
        # Delete processing directory if it exists
        text_dir_row = conn.execute("SELECT text_dir FROM documents WHERE hash = ?", (h,)).fetchone()
        if text_dir_row and text_dir_row['text_dir']:
            shutil.rmtree(text_dir_row['text_dir'], ignore_errors=True)
        conn.execute("DELETE FROM documents WHERE hash = ?", (h,))
        conn.execute("DELETE FROM catalogue WHERE hash = ?", (h,))
    results['docs_deleted'] = len(doc_hashes)

    # Delete zim_articles records
    conn.execute("DELETE FROM zim_articles WHERE zim_source_id = ?", (source_id,))

    # Delete zim_sources record
    conn.execute("DELETE FROM zim_sources WHERE id = ?", (source_id,))
    conn.commit()

    # Step 4: Remove from kiwix-serve library
    # Strip only a trailing '.zim'; str.replace would also remove the text
    # from the middle of a name.
    # NOTE(review): kiwix-manage's `remove` is documented as taking a book ID;
    # confirm that the filename stem is accepted by the deployed version.
    if zim_filename.endswith('.zim'):
        book_ref = zim_filename[:-len('.zim')]
    else:
        book_ref = zim_filename
    try:
        proc = subprocess.run(
            ['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'remove', book_ref],
            capture_output=True, text=True, timeout=10
        )
        if proc.returncode != 0:
            # subprocess.run does not raise on a non-zero exit without check=True.
            logger.warning(
                f"kiwix-manage remove exited with {proc.returncode}: {(proc.stderr or '').strip()}"
            )
    except Exception as e:
        logger.warning(f"kiwix-manage remove failed: {e}")

    # Step 5: Delete the ZIM file
    if zim_path and os.path.isfile(zim_path):
        try:
            os.remove(zim_path)
            results['file_deleted'] = True
        except Exception as e:
            logger.warning(f"ZIM file delete failed: {e}")
            results['file_deleted'] = False

    # Refresh cache
    try:
        _cache['kiwix_sources'] = _build_kiwix_sources()
    except Exception as e:
        logger.warning(f"Kiwix sources cache refresh after remove failed: {e}")

    logger.info(f"Removed ZIM source '{zim_title}': {results}")
    return jsonify({'ok': True, 'results': results})


def _spawn_zim_ingest(source_id):
    """Start ZIM ingestion in a background thread.

    The worker is a daemon thread named ``zim-ingest-<source_id>``. It calls
    ``ingest_zim`` with a fresh StatusDB and the current config, then
    refreshes the cached Kiwix sources. All failures are logged inside the
    thread; nothing is raised to the caller.
    """
    def _run():
        try:
            from .processors.zim_processor import ingest_zim
            config = get_config()
            db = StatusDB()
            logger.info(f"Starting ZIM ingest for source {source_id}")
            result = ingest_zim(source_id, db, config)
            logger.info(f"ZIM ingest complete for source {source_id}: {result}")
        except Exception as e:
            logger.error(f"ZIM ingest failed for source {source_id}: {e}")
            return

        # Refresh cache after completion
        try:
            _cache['kiwix_sources'] = _build_kiwix_sources()
        except Exception as e:
            logger.warning(f"Kiwix sources cache refresh after ingest failed: {e}")

    t = threading.Thread(target=_run, daemon=True, name=f'zim-ingest-{source_id}')
    t.start()


def _build_kiwix_sources():
    """Build Kiwix sources data for the dashboard cache.

    Returns a dict with:
      * ``sources`` - one dict per zim_sources row (newest first), each with a
        ``pipeline`` mapping of document status -> count for that source,
      * ``kiwix_serve`` - 'active' when http://localhost:8430 answers 200
        within 3 seconds, otherwise 'inactive', plus the public URL,
      * ``totals`` - source count, summed article/processed counts, and the
        number of documents whose status is neither 'complete' nor 'failed'.
    """
    import urllib.request

    db = StatusDB()
    conn = db._get_conn()

    # Get all ZIM sources
    rows = conn.execute("""
        SELECT id, zim_filename, title, description, language, category,
               article_count, status, processed_count, skipped_count, error_count,
               ingest_enabled, detected_at, started_at, completed_at
        FROM zim_sources
        ORDER BY detected_at DESC
    """).fetchall()

    # Pipeline stats for all Kiwix documents in one grouped query, keyed by
    # catalogue category. A source's documents are identified the same way
    # api_kiwix_remove finds them: source = 'kiwix' and category = title.
    # Running an unfiltered query once per source gave every source the same
    # global counts and multiplied the in-pipeline total by the source count.
    pipeline_by_category = {}
    try:
        pipe_rows = conn.execute("""
            SELECT c.category, d.status, COUNT(*) as cnt
            FROM documents d
            JOIN catalogue c ON d.hash = c.hash
            WHERE c.source = 'kiwix'
            GROUP BY c.category, d.status
        """).fetchall()
        for pr in pipe_rows:
            pipeline_by_category.setdefault(pr['category'], {})[pr['status']] = pr['cnt']
    except Exception as e:
        logger.warning(f"Kiwix pipeline stats query failed: {e}")

    sources = []
    total_articles = 0
    total_processed = 0
    total_in_pipeline = 0

    for r in rows:
        source = dict(r)
        total_articles += r['article_count'] or 0
        total_processed += r['processed_count'] or 0

        # Get pipeline stats for this source's documents
        pipeline = dict(pipeline_by_category.get(r['title'] or r['zim_filename'], {}))

        in_pipe = sum(v for k, v in pipeline.items() if k not in ('complete', 'failed'))
        total_in_pipeline += in_pipe
        source['pipeline'] = pipeline
        sources.append(source)

    # Check kiwix-serve health
    kiwix_status = 'inactive'
    try:
        # Context manager closes the HTTP response on every refresh cycle.
        with urllib.request.urlopen("http://localhost:8430", timeout=3) as resp:
            if resp.status == 200:
                kiwix_status = 'active'
    except Exception:
        # Any connection or HTTP error simply means "inactive".
        pass

    return {
        'sources': sources,
        'kiwix_serve': {'status': kiwix_status, 'url': 'https://wiki.echo6.co'},
        'totals': {
            'sources': len(sources),
            'articles': total_articles,
            'processed': total_processed,
            'in_pipeline': total_in_pipeline,
        }
    }
meta['page_timestamps'] except Exception: pass - if doc.get('path'): + # For ZIM articles, build wiki.echo6.co URL from meta.json + if source_type == 'zim' and meta.get('article_path'): + from urllib.parse import quote as url_quote + zim_name = meta.get('zim_name', '') + if not zim_name: + # Derive from zim_file: strip flavor/date suffix + zf = meta.get('zim_file', '') + zim_name = re.sub(r'_(?:maxi|mini|nopic)_[\d-]+\.zim$', '', zf) + article_path = url_quote(meta['article_path'], safe='/:@!$&()*+,;=-._~') + download_url = f'https://wiki.echo6.co/{zim_name}/{article_path}' + elif doc.get('path'): download_url = generate_download_url( doc['path'], config.get('library_root', '/mnt/library') ) diff --git a/static/css/recon.css b/static/css/recon.css index 95aed52..9289f93 100644 --- a/static/css/recon.css +++ b/static/css/recon.css @@ -211,6 +211,7 @@ tr:hover { background: var(--bg-secondary); } .badge-web { background: #1e3a5f; color: #60a5fa; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; } .badge-pdf { background: #2d5a2d; color: #4ade80; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; } .badge-transcript { background: #3b1f5e; color: #c084fc; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; } +.badge-wiki { background: #1f4a3b; color: #34d399; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; } /* ── Trend indicators ── */ .trend { font-size: 11px; margin-left: 6px; } @@ -315,3 +316,16 @@ tr:hover { background: var(--bg-secondary); } .errors-panel.has-errors { display: block; } .errors-panel summary { color: var(--red); cursor: pointer; font-size: 13px; margin-bottom: 8px; } .errors-panel .error-line { color: var(--text-muted); font-size: 11px; padding: 2px 0; border-bottom: 1px solid var(--border); } + +/* ── Toggle switch ── */ +.toggle-switch { position: relative; display: inline-block; width: 40px; height: 20px; } +.toggle-switch input { opacity: 0; width: 0; height: 0; } +.toggle-slider { 
position: absolute; cursor: pointer; inset: 0; background: #333; border-radius: 20px; transition: 0.3s; } +.toggle-slider:before { content: ''; position: absolute; height: 16px; width: 16px; left: 2px; bottom: 2px; background: #888; border-radius: 50%; transition: 0.3s; } +.toggle-switch input:checked + .toggle-slider { background: #1a4a2e; } +.toggle-switch input:checked + .toggle-slider:before { transform: translateX(20px); background: #00ff41; } + +/* ── Kiwix status badges ── */ +.badge-complete { background: #1a4a2e; color: #00ff41; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; } +.badge-ingesting { background: #1a3a5a; color: #0ea5e9; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; } +.badge-detected { background: #333; color: #888; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; } diff --git a/static/js/dashboard.js b/static/js/dashboard.js index 254d92a..0bd0b39 100644 --- a/static/js/dashboard.js +++ b/static/js/dashboard.js @@ -88,7 +88,7 @@ var pipeCount = s.in_pipeline || 0; totalCat += catCount; totalComp += compCount; totalPipe += pipeCount; totalConcepts += s.concepts; totalVectors += s.vectors; - var badge = s.type === 'transcript' ? 'TRANSCRIPT' : s.type === 'web' ? 'WEB' : 'PDF'; + var badge = s.type === 'transcript' ? 'TRANSCRIPT' : s.type === 'web' ? 'WEB' : s.type === 'wiki' ? 'WIKI' : 'PDF'; var compPct = catCount > 0 ? (compCount / catCount * 100) : 0; var pipePct = catCount > 0 ? (pipeCount / catCount * 100) : 0; var compColor = compPct >= 100 ? '#00ff41' : compPct > 0 ? '#ffa500' : '#666'; @@ -185,7 +185,7 @@ rtb.innerHTML = 'None yet'; } else { rtb.innerHTML = data.recent_complete.map(function(r) { - var badge = r.type === 'transcript' ? 'TRANSCRIPT' : r.type === 'web' ? 'WEB' : 'PDF'; + var badge = r.type === 'transcript' ? 'TRANSCRIPT' : r.type === 'web' ? 'WEB' : r.type === 'wiki' ? 
'WIKI' : 'PDF'; return '' + r.title + '' + badge + '' + r.concepts + '' + r.vectors + ''; }).join(''); diff --git a/static/js/kiwix.js b/static/js/kiwix.js new file mode 100644 index 0000000..aab8552 --- /dev/null +++ b/static/js/kiwix.js @@ -0,0 +1,136 @@ +/* RECON Kiwix Dashboard JS */ +(function() { + 'use strict'; + + function loadKiwixDashboard() { + return RECON.fetchJSON('/api/kiwix/sources').then(function(data) { + // Update stat cards + var t = data.totals || {}; + RECON.set('kx-sources', RECON.fmt(t.sources)); + RECON.set('kx-articles', RECON.fmt(t.articles)); + RECON.set('kx-processed', RECON.fmt(t.processed)); + RECON.set('kx-pipeline', RECON.fmt(t.in_pipeline)); + + // Kiwix-serve status dot + var ks = data.kiwix_serve || {}; + var dot = document.getElementById('svc-kiwix-serve'); + dot.className = 'svc-dot ' + (ks.status === 'active' ? 'active' : 'inactive'); + + // ZIM table + var sources = data.sources || []; + var html = ''; + sources.forEach(function(s) { + var pctDone = s.article_count > 0 ? (s.processed_count / s.article_count * 100).toFixed(1) : 0; + var statusBadge = s.status === 'complete' ? 'COMPLETE' : + s.status === 'ingesting' ? 'INGESTING' : + 'DETECTED'; + // Derive browse URL from zim_filename + var zimName = s.zim_filename.replace(/_(?:maxi|mini|nopic)_[\d-]+\.zim$/, ''); + var browseUrl = 'https://wiki.echo6.co/' + zimName + '/'; + // Toggle switch + var checked = s.ingest_enabled ? ' checked' : ''; + var toggle = ''; + + html += '' + + '' + (s.title || s.zim_filename) + '' + + '
' + s.zim_filename + '
' + + '' + (s.language || '\u2014') + '' + + '' + RECON.fmt(s.article_count) + '' + + '' + RECON.fmt(s.processed_count) + ' / ' + RECON.fmt(s.article_count) + + ' (' + pctDone + '%)' + + '' + statusBadge + '' + + '' + toggle + '' + + 'Browse' + + '' + + ''; + }); + if (!html) html = 'No ZIM sources detected'; + RECON.setHTML('kx-table-body', html); + }).catch(function(err) { + console.error('Kiwix dashboard error:', err); + }); + } + + function toggleIngest(id, enabled) { + RECON.postJSON('/api/kiwix/toggle-ingest/' + id, {enabled: enabled}).then(function(data) { + if (data.ok) loadKiwixDashboard(); + }); + } + + function removeSource(id, title) { + if (!confirm('Remove "' + title + '"?\n\nThis will delete the ZIM file, all ingested documents, and associated vectors from Qdrant. This cannot be undone.')) return; + RECON.postJSON('/api/kiwix/remove/' + id).then(function(data) { + if (data.ok) { + var r = data.results || {}; + alert('Removed: ' + r.docs_deleted + ' docs, ~' + r.vectors_deleted + ' vector batches deleted, file ' + (r.file_deleted ? 
'deleted' : 'not found')); + loadKiwixDashboard(); + } + }); + } + + function triggerIngest(id) { + RECON.postJSON('/api/kiwix/trigger-ingest/' + id).then(function(data) { + if (data.ok) loadKiwixDashboard(); + }); + } + + function uploadZim() { + var input = document.getElementById('kx-file-input'); + var file = input.files[0]; + if (!file) return; + + var statusEl = document.getElementById('kx-upload-status'); + var progressDiv = document.getElementById('kx-upload-progress'); + var progressBar = document.getElementById('kx-progress-bar'); + var progressText = document.getElementById('kx-progress-text'); + + statusEl.textContent = 'Uploading ' + file.name + '...'; + progressDiv.style.display = 'block'; + + var formData = new FormData(); + formData.append('file', file); + + var xhr = new XMLHttpRequest(); + xhr.open('POST', '/api/kiwix/upload', true); + + xhr.upload.onprogress = function(e) { + if (e.lengthComputable) { + var pct = (e.loaded / e.total * 100).toFixed(1); + progressBar.style.width = pct + '%'; + progressText.textContent = RECON.fmtBytes(e.loaded) + ' / ' + RECON.fmtBytes(e.total) + ' (' + pct + '%)'; + } + }; + + xhr.onload = function() { + if (xhr.status === 200) { + var resp = JSON.parse(xhr.responseText); + statusEl.textContent = resp.ok ? 'Upload complete: ' + resp.filename : 'Error: ' + (resp.error || 'Unknown'); + progressBar.style.width = '100%'; + progressBar.style.background = resp.ok ? 
'#16a34a' : '#dc2626'; + if (resp.ok) loadKiwixDashboard(); + } else { + statusEl.textContent = 'Upload failed (HTTP ' + xhr.status + ')'; + progressBar.style.background = '#dc2626'; + } + input.value = ''; + }; + + xhr.onerror = function() { + statusEl.textContent = 'Upload failed (network error)'; + progressBar.style.background = '#dc2626'; + input.value = ''; + }; + + xhr.send(formData); + } + + // Expose for inline onclick + window.KIWIX = { toggleIngest: toggleIngest, triggerIngest: triggerIngest, remove: removeSource }; + + document.addEventListener('DOMContentLoaded', function() { + RECON.startRefresh(loadKiwixDashboard, 30000); + document.getElementById('kx-file-input').addEventListener('change', uploadZim); + }); +})(); diff --git a/templates/base.html b/templates/base.html index 09db6d8..49b1a21 100644 --- a/templates/base.html +++ b/templates/base.html @@ -19,6 +19,7 @@ diff --git a/templates/kiwix/dashboard.html b/templates/kiwix/dashboard.html new file mode 100644 index 0000000..72bbed4 --- /dev/null +++ b/templates/kiwix/dashboard.html @@ -0,0 +1,48 @@ +{% extends "base.html" %} +{% block content %} +
+ +
+
ZIM Sources
+
Total Articles
+
Processed
+
In Pipeline
+
+ + +
+
Kiwix-Serve
+ +
+ + +
+

ZIM Library

+ + + + + + + +
TitleLanguageArticlesProgressStatusIngestBrowse
Loading...
+
+ + +
+

Upload ZIM File

+
+ + + +
+ +
+
+{% endblock %} +{% block scripts %} + +{% endblock %}