""" RECON Web Dashboard & API Flask app on port 8420. Jinja2 templates + static files. Pages: Knowledge (Dashboard, Catalogue, Upload, Web Ingest, Failures), PeerTube (Dashboard, Channels), Search, Settings (Keys, Cookies, VPN, Health). API endpoints for all pipeline operations including crawl, ingest, and search. Dependencies: Flask, qdrant-client, requests Config: web, vector_db, embedding sections of config.yaml """ import glob import json import threading import os import shutil import tempfile import requests as http_requests from flask import Flask, request, jsonify, redirect, render_template from qdrant_client import QdrantClient from qdrant_client.models import Filter, FieldCondition, MatchValue from werkzeug.utils import secure_filename from .utils import get_config, content_hash, clean_filename_to_title, derive_source_and_category, generate_download_url, setup_logging from .status import StatusDB logger = setup_logging('recon.api') # ── Background cache warmer ── # All expensive queries run proactively so API endpoints never block. 
_cache = {
    'knowledge_stats': None,
    'pt_dashboard': None,
    'qdrant_scroll': None,
    'qdrant_scroll_ts': 0,
    'quick_stats': None,
    'kiwix_sources': None,
}

# Templates and static files are resolved relative to the parent of the
# directory that contains this module.
app = Flask(__name__,
            template_folder=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'templates'),
            static_folder=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'static'))
app.config['MAX_CONTENT_LENGTH'] = None  # ZIM files can be multi-GB


# ── Navigation Constants ──

KNOWLEDGE_SUBNAV = [
    {'href': '/', 'label': 'Dashboard'},
    {'href': '/catalogue', 'label': 'Catalogue'},
    {'href': '/upload', 'label': 'Upload'},
    {'href': '/web-ingest', 'label': 'Web Ingest'},
    {'href': '/failures', 'label': 'Failures'},
]

PEERTUBE_SUBNAV = [
    {'href': '/peertube', 'label': 'Dashboard'},
    {'href': '/peertube/channels', 'label': 'Channels'},
]

KIWIX_SUBNAV = [
    {'href': '/kiwix', 'label': 'Library'},
    {'href': '/kiwix/scraper', 'label': 'Scraper'},
]

SETTINGS_SUBNAV = [
    {'href': '/settings/keys', 'label': 'API Keys'},
    {'href': '/settings/cookies', 'label': 'YouTube Cookies'},
    {'href': '/settings/vpn', 'label': 'NordVPN'},
    {'href': '/settings/health', 'label': 'Service Health'},
]


def _format_source_citation(payload):
    """Format a human-readable citation from a search result payload.

    Uses ``book_title`` when it is set; otherwise the title is derived from
    ``filename`` (default ``'Unknown'``) via ``clean_filename_to_title``.
    When ``page_ref`` is set it is appended after a comma, prefixed with
    ``"p. "`` unless its string form already starts with ``p``.

    Args:
        payload: Mapping holding the point's metadata fields.

    Returns:
        The citation string, e.g. ``"<book>, p. 12"`` or just ``"<book>"``.
    """
    book = payload.get('book_title', '')
    if not book:
        book = clean_filename_to_title(payload.get('filename', 'Unknown'))

    page = payload.get('page_ref', '')
    if not page:
        return book

    page_str = str(page)
    if not page_str.startswith('p'):
        page_str = f"p. {page_str}"
    return f"{book}, {page_str}"


ALLOWED_EXTENSIONS = {'.pdf', '.txt', '.epub', '.doc', '.docx', '.mobi'}

HOPPER_ROUTING = {
    '.pdf': '/opt/recon/data/acquired/pdf/',
    '.txt': '/opt/recon/data/acquired/text/',
    '.epub': '/opt/recon/data/acquired/pdf/',
    '.doc': '/opt/recon/data/acquired/pdf/',
    '.docx': '/opt/recon/data/acquired/pdf/',
    '.mobi': '/opt/recon/data/acquired/pdf/',
}


def _process_upload(filepath, original_filename, ext, category, config, db):
    """Process an upload: hash, dedup, drop into hopper for dispatcher pickup.

    Args:
        filepath: Path of the uploaded file on disk.
        original_filename: Name the file was uploaded under; stored in the
            sidecar and used (minus its extension) as the title.
        ext: File extension including the leading dot; selects the hopper
            directory via ``HOPPER_ROUTING`` (unknown values go to the PDF
            hopper).
        category: Category recorded in the sidecar.
        config: Not used by this function; kept so callers need not change.
        db: Status database; its connection is queried for an existing
            catalogue row with the same content hash.

    Returns:
        dict with ``hash``, ``filename``, ``source_type`` and
        ``status='queued'``.

    Raises:
        ValueError: If the content hash is already catalogued, or a file
            named after that hash is already present in a hopper directory.
        OSError: If writing the sidecar or copying the content fails; any
            partially written files for this upload are removed first.
    """
    file_hash = content_hash(filepath)

    conn = db._get_conn()
    existing = conn.execute(
        "SELECT * FROM catalogue WHERE hash = ?", (file_hash,)
    ).fetchone()
    if existing:
        raise ValueError(f"Duplicate: file already catalogued as {existing['filename']}")

    # Also check if already sitting in a hopper dir awaiting dispatch.
    # Several extensions route to the same directory, so probe each
    # distinct directory only once.
    for hopper in set(HOPPER_ROUTING.values()):
        if any(os.path.exists(os.path.join(hopper, file_hash + e))
               for e in ALLOWED_EXTENSIONS):
            raise ValueError("Duplicate: file already queued for processing")

    hopper_dir = HOPPER_ROUTING.get(ext, '/opt/recon/data/acquired/pdf/')
    os.makedirs(hopper_dir, exist_ok=True)

    target_path = os.path.join(hopper_dir, file_hash + ext)
    meta_path = os.path.join(hopper_dir, file_hash + '.meta.json')

    stem = os.path.splitext(original_filename)[0]
    sidecar = {
        'title': stem,
        'source': 'dashboard_upload',
        'source_type': ext.lstrip('.'),
        'category': category,
        'original_filename': original_filename,
    }

    # Write sidecar first (with .tmp safety), then content. The content uses
    # the same write-to-.tmp-then-rename pattern so a file never appears
    # under its final name while still being copied.
    # NOTE(review): assumes the dispatcher ignores '*.tmp' names, as the
    # existing sidecar handling already implies - confirm.
    tmp_meta = meta_path + '.tmp'
    tmp_target = target_path + '.tmp'
    try:
        with open(tmp_meta, 'w', encoding='utf-8') as f:
            json.dump(sidecar, f, indent=2)
        os.replace(tmp_meta, meta_path)

        shutil.copy2(filepath, tmp_target)
        os.replace(tmp_target, target_path)
    except Exception:
        # Leave nothing behind: an orphaned sidecar or partial content file
        # would otherwise linger in the hopper after a failed upload.
        for leftover in (tmp_meta, tmp_target, meta_path):
            try:
                os.remove(leftover)
            except OSError:
                pass  # best-effort cleanup; the original error is re-raised
        raise

    return {
        'hash': file_hash,
        'filename': original_filename,
        'source_type': ext.lstrip('.'),
        'status': 'queued',
    }


# ── Page Routes ──

@app.route('/')
def dashboard():
    """Render the Knowledge dashboard page."""
    return render_template('knowledge/dashboard.html',
                           domain='knowledge',
                           subnav=KNOWLEDGE_SUBNAV,
                           active_page='/')
@app.route('/search') def search_page(): query = request.args.get('q', '') if not query: return render_template('search.html', domain='search', subnav=None, active_page='/search') config = get_config() limit = int(request.args.get('limit', 20)) source_filter = request.args.get('source_type', None) try: from .embedder import get_embedding_single query_vector = get_embedding_single(query, config) qdrant = QdrantClient( host=config['vector_db']['host'], port=config['vector_db']['port'], timeout=60 ) search_filter = None if source_filter: search_filter = Filter(must=[ FieldCondition(key="source_type", match=MatchValue(value=source_filter)) ]) results = qdrant.query_points( collection_name=config['vector_db']['collection'], query=query_vector, limit=limit, query_filter=search_filter ).points formatted = [] for r in results: p = r.payload raw_dom = p.get('domain', []) if isinstance(raw_dom, str): domains = [raw_dom] if raw_dom else [] elif isinstance(raw_dom, list): domains = raw_dom else: domains = [] formatted.append({ 'score': r.score, 'title': p.get('title', 'Untitled'), 'summary': p.get('summary', p.get('content', '')[:200]), 'citation': _format_source_citation(p), 'download_url': p.get('download_url', ''), 'source_type': p.get('source_type', 'document'), 'knowledge_type': p.get('knowledge_type', ''), 'complexity': p.get('complexity', ''), 'domains': domains, }) return render_template('search.html', domain='search', subnav=None, active_page='/search', query=query, results=formatted) except Exception as e: return render_template('search.html', domain='search', subnav=None, active_page='/search', query=query, error=str(e)) @app.route('/catalogue') def catalogue_page(): db = StatusDB() source = request.args.get('source', None) category = request.args.get('category', None) per_page = int(request.args.get('per_page', 50)) page = int(request.args.get('page', 1)) if page < 1: page = 1 offset = (page - 1) * per_page total_count = db.count_documents(source=source, 
category=category) total_pages = max(1, (total_count + per_page - 1) // per_page) if page > total_pages: page = total_pages offset = (page - 1) * per_page docs = db.get_all_documents(source=source, category=category, limit=per_page, offset=offset) sources = db.get_sources() return render_template('knowledge/catalogue.html', domain='knowledge', subnav=KNOWLEDGE_SUBNAV, active_page='/catalogue', docs=docs, sources=sources, current_source=source, page=page, per_page=per_page, total_pages=total_pages, total_count=total_count) @app.route('/upload') def upload_page(): db = StatusDB() config = get_config() upload_paths = config.get('upload_paths', {}) categories = sorted(k for k in upload_paths if k != 'default') db_sources = db.get_sources() for s in db_sources: if s not in categories: categories.append(s) options_html = ''.join(f'