""" RECON Web Dashboard & API Flask app on port 8420. Jinja2 templates + static files. Pages: Knowledge (Dashboard, Catalogue, Upload, Web Ingest, Failures), PeerTube (Dashboard, Channels), Search, Settings (Keys, Cookies, VPN, Health). API endpoints for all pipeline operations including crawl, ingest, and search. Dependencies: Flask, qdrant-client, requests Config: web, vector_db, embedding sections of config.yaml """ import json import threading import os import shutil import tempfile import requests as http_requests from flask import Flask, request, jsonify, redirect, render_template from qdrant_client import QdrantClient from qdrant_client.models import Filter, FieldCondition, MatchValue from werkzeug.utils import secure_filename from .utils import get_config, content_hash, clean_filename_to_title, derive_source_and_category, generate_download_url, setup_logging from .status import StatusDB logger = setup_logging('recon.api') # ── Background cache warmer ── # All expensive queries run proactively so API endpoints never block. 
# Slots for precomputed query results. Per the file's section header these
# are filled by a background cache warmer (not visible in this chunk), so
# every slot starts out empty.
_cache = dict(
    knowledge_stats=None,
    pt_dashboard=None,
    qdrant_scroll=None,
    qdrant_scroll_ts=0,
    quick_stats=None,
)

# Templates and static assets are resolved relative to the parent of the
# directory that contains this module.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

app = Flask(
    __name__,
    template_folder=os.path.join(_PROJECT_ROOT, 'templates'),
    static_folder=os.path.join(_PROJECT_ROOT, 'static'),
)

# ── Navigation Constants ──
# Each sub-navigation is a list of {'href', 'label'} dicts handed to the
# templates as ``subnav``.
KNOWLEDGE_SUBNAV = [
    {'href': href, 'label': label}
    for href, label in (
        ('/', 'Dashboard'),
        ('/catalogue', 'Catalogue'),
        ('/upload', 'Upload'),
        ('/web-ingest', 'Web Ingest'),
        ('/failures', 'Failures'),
    )
]

PEERTUBE_SUBNAV = [
    {'href': href, 'label': label}
    for href, label in (
        ('/peertube', 'Dashboard'),
        ('/peertube/channels', 'Channels'),
    )
]

SETTINGS_SUBNAV = [
    {'href': href, 'label': label}
    for href, label in (
        ('/settings/keys', 'API Keys'),
        ('/settings/cookies', 'YouTube Cookies'),
        ('/settings/vpn', 'NordVPN'),
        ('/settings/health', 'Service Health'),
    )
]
def _format_source_citation(payload):
    """Format a human-readable citation from a search result payload.

    Args:
        payload: Mapping read for ``book_title``, ``filename`` and
            ``page_ref``.

    Returns:
        ``"<book>, <page>"`` when ``page_ref`` is truthy, otherwise just the
        book title. A missing/empty ``book_title`` falls back to
        ``clean_filename_to_title`` applied to ``filename`` (or
        ``'Unknown'``).
    """
    book = payload.get('book_title', '') or clean_filename_to_title(
        payload.get('filename', 'Unknown'))

    page = payload.get('page_ref', '')
    if not page:
        return book

    page_str = str(page)
    # Values that already start with "p" (e.g. "p. 3", "pp. 3-4") are kept
    # verbatim; everything else gets a "p. " prefix.
    if not page_str.startswith('p'):
        page_str = f"p. {page_str}"
    return f"{book}, {page_str}"


def _resolve_upload_path(category, config):
    """Resolve the target directory for an upload given a category name.

    Args:
        category: Category name from the upload request; may be empty/None.
        config: Mapping with a required ``library_root`` entry and an
            optional ``upload_paths`` mapping of category -> directory
            (with an optional ``default`` entry).

    Returns:
        The explicitly configured directory when ``category`` is a key of
        ``upload_paths``; otherwise the default directory, extended with the
        sanitised category name when that name is non-empty.

    Raises:
        KeyError: If ``config`` has no ``library_root``.
    """
    library_root = config['library_root']
    # A key that is present but null (e.g. an empty YAML section) comes back
    # as None rather than the .get() default, so normalise it to an empty
    # mapping before the membership test.
    upload_paths = config.get('upload_paths') or {}
    if category in upload_paths:
        return upload_paths[category]

    # Same reasoning for a null/empty ``default`` entry: fall back to the
    # library root instead of handing None to os.path.join.
    default_path = upload_paths.get('default') or library_root
    safe_category = secure_filename(category) if category else ''
    if not safe_category:
        return default_path
    return os.path.join(default_path, safe_category)


def _process_upload(filepath, original_filename, category, config, db):
    """Process a single PDF upload: hash, dedup, copy to library, catalogue, queue.

    Args:
        filepath: Path of the uploaded file to ingest.
        original_filename: Client-supplied file name; sanitised before use.
        category: Category name forwarded to ``_resolve_upload_path``.
        config: Mapping providing ``library_root`` (and optional
            ``upload_paths``).
        db: Status database object; this function uses ``_get_conn()``,
            ``add_to_catalogue()`` and ``queue_document()`` on it.

    Returns:
        Dict with ``hash``, ``filename``, ``category``, ``source``, ``path``,
        ``size_bytes`` and ``status`` (always ``'queued'``).

    Raises:
        ValueError: If a catalogue row with the same content hash exists.
    """
    library_root = config['library_root']
    file_hash = content_hash(filepath)

    # Dedup on content hash before touching the library directory.
    conn = db._get_conn()
    existing = conn.execute(
        "SELECT * FROM catalogue WHERE hash = ?", (file_hash,)
    ).fetchone()
    if existing:
        raise ValueError(f"Duplicate: file already catalogued as {existing['filename']}")

    target_dir = _resolve_upload_path(category, config)
    os.makedirs(target_dir, exist_ok=True)

    # secure_filename can return an empty string; fall back to a hash-based
    # name in that case.
    safe_name = secure_filename(original_filename) or f"{file_hash}.pdf"

    target_path = os.path.join(target_dir, safe_name)
    if os.path.exists(target_path):
        # Different content under an existing name: disambiguate with the
        # first 8 characters of the hash.
        # NOTE(review): the catalogue entry and the returned 'filename' keep
        # the un-suffixed name while 'path' carries the suffix — confirm
        # this is intended.
        base, ext = os.path.splitext(safe_name)
        target_path = os.path.join(target_dir, f"{base}_{file_hash[:8]}{ext}")

    shutil.copy2(filepath, target_path)
    size = os.path.getsize(target_path)

    source, derived_category = derive_source_and_category(target_path, library_root)
    db.add_to_catalogue(file_hash, safe_name, target_path, size, source, derived_category)
    db.queue_document(file_hash)

    return {
        'hash': file_hash,
        'filename': safe_name,
        'category': derived_category,
        'source': source,
        'path': target_path,
        'size_bytes': size,
        'status': 'queued'
    }
@app.route('/')
def dashboard():
    """Render the Knowledge dashboard page."""
    return render_template(
        'knowledge/dashboard.html',
        domain='knowledge',
        subnav=KNOWLEDGE_SUBNAV,
        active_page='/',
    )


def _normalize_domains(raw):
    """Coerce a payload ``domain`` value into a list (str -> [str], list kept, else [])."""
    if isinstance(raw, str):
        return [raw] if raw else []
    if isinstance(raw, list):
        return raw
    return []


@app.route('/search')
def search_page():
    """Render the search page, running a vector search when ``q`` is given.

    Query parameters:
        q: Search text. When empty, the bare search form is rendered.
        limit: Maximum number of hits (default 20). Non-numeric values fall
            back to the default; values below 1 are raised to 1.
        source_type: Optional exact-match filter on the ``source_type``
            payload field.

    Any exception raised while embedding or querying is logged and rendered
    on the page as ``error`` instead of propagating.
    """
    query = request.args.get('q', '')
    if not query:
        return render_template('search.html', domain='search', subnav=None,
                               active_page='/search')

    config = get_config()
    # type=int makes a malformed ?limit= fall back to the default instead of
    # raising ValueError outside the try block (which surfaced as HTTP 500).
    limit = max(1, request.args.get('limit', 20, type=int))
    source_filter = request.args.get('source_type', None)

    try:
        from .embedder import get_embedding_single
        query_vector = get_embedding_single(query, config)

        qdrant = QdrantClient(
            host=config['vector_db']['host'],
            port=config['vector_db']['port'],
            timeout=60
        )

        search_filter = None
        if source_filter:
            search_filter = Filter(must=[
                FieldCondition(key="source_type", match=MatchValue(value=source_filter))
            ])

        results = qdrant.query_points(
            collection_name=config['vector_db']['collection'],
            query=query_vector,
            limit=limit,
            query_filter=search_filter
        ).points

        formatted = []
        for r in results:
            # A point may carry no payload at all; treat that as empty.
            p = r.payload or {}

            # Only build the content snippet when there is no summary, and
            # tolerate a null ``content`` value rather than slicing None.
            summary = p.get('summary')
            if summary is None:
                summary = (p.get('content') or '')[:200]

            formatted.append({
                'score': r.score,
                'title': p.get('title', 'Untitled'),
                'summary': summary,
                'citation': _format_source_citation(p),
                'download_url': p.get('download_url', ''),
                'source_type': p.get('source_type', 'document'),
                'knowledge_type': p.get('knowledge_type', ''),
                'complexity': p.get('complexity', ''),
                'domains': _normalize_domains(p.get('domain', [])),
            })

        return render_template('search.html', domain='search', subnav=None,
                               active_page='/search', query=query,
                               results=formatted)
    except Exception as e:
        # Top-level boundary for the page: keep rendering the error to the
        # user, but record the traceback so failures are diagnosable.
        logger.exception("Search failed for query %r", query)
        return render_template('search.html', domain='search', subnav=None,
                               active_page='/search', query=query,
                               error=str(e))
@app.route('/catalogue')
def catalogue_page():
    """Render one page of the document catalogue.

    Query parameters:
        source, category: Optional filters passed through to the status DB.
        per_page: Page size (default 50). Non-numeric values and values
            below 1 fall back to 50.
        page: 1-based page number (default 1). Non-numeric values fall back
            to 1; out-of-range values are clamped to ``1..total_pages``.
    """
    db = StatusDB()
    source = request.args.get('source', None)
    category = request.args.get('category', None)

    # type=int returns the default on a malformed value instead of raising
    # ValueError (HTTP 500).
    per_page = request.args.get('per_page', 50, type=int)
    if per_page < 1:
        # Zero would divide by zero below; negatives would produce a
        # negative LIMIT/offset.
        per_page = 50
    page = max(1, request.args.get('page', 1, type=int))

    total_count = db.count_documents(source=source, category=category)
    # Ceiling division; an empty catalogue still counts as one page.
    total_pages = max(1, (total_count + per_page - 1) // per_page)
    page = min(page, total_pages)
    offset = (page - 1) * per_page

    docs = db.get_all_documents(source=source, category=category,
                                limit=per_page, offset=offset)
    sources = db.get_sources()

    return render_template(
        'knowledge/catalogue.html',
        domain='knowledge',
        subnav=KNOWLEDGE_SUBNAV,
        active_page='/catalogue',
        docs=docs,
        sources=sources,
        current_source=source,
        page=page,
        per_page=per_page,
        total_pages=total_pages,
        total_count=total_count,
    )