"""
RECON Web Dashboard & API
Flask app on port 8420. Jinja2 templates + static files.
Pages: Knowledge (Dashboard, Catalogue, Upload, Web Ingest, Failures),
PeerTube (Dashboard, Channels), Search, Settings (Keys, Cookies, VPN, Health).
API endpoints for all pipeline operations including crawl, ingest, and search.
Dependencies: Flask, qdrant-client, requests
Config: web, vector_db, embedding sections of config.yaml
"""
import json
import threading
import os
import shutil
import tempfile
import requests as http_requests
from flask import Flask, request, jsonify, redirect, render_template
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue
from werkzeug.utils import secure_filename
from .utils import get_config, content_hash, clean_filename_to_title, derive_source_and_category, generate_download_url, setup_logging
from .status import StatusDB
logger = setup_logging('recon.api')
# ── Background cache warmer ──
# All expensive queries run proactively so API endpoints never block.
_cache = {
'knowledge_stats': None,
'pt_dashboard': None,
'qdrant_scroll': None,
'qdrant_scroll_ts': 0,
'quick_stats': None,
}
# The templates/ and static/ directories sit in the parent of the directory
# that contains this module, so resolve that parent once and reuse it.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
app = Flask(
    __name__,
    template_folder=os.path.join(_PROJECT_ROOT, 'templates'),
    static_folder=os.path.join(_PROJECT_ROOT, 'static'),
)
# ── Navigation Constants ──
# Sub-navigation entries for the Knowledge pages, as {'href', 'label'} dicts
# in display order.
KNOWLEDGE_SUBNAV = [
    {'href': href, 'label': label}
    for href, label in (
        ('/', 'Dashboard'),
        ('/catalogue', 'Catalogue'),
        ('/upload', 'Upload'),
        ('/web-ingest', 'Web Ingest'),
        ('/failures', 'Failures'),
    )
]
# Sub-navigation entries for the PeerTube pages, as {'href', 'label'} dicts
# in display order.
PEERTUBE_SUBNAV = [
    {'href': href, 'label': label}
    for href, label in (
        ('/peertube', 'Dashboard'),
        ('/peertube/channels', 'Channels'),
    )
]
# Sub-navigation entries for the Settings pages, as {'href', 'label'} dicts
# in display order.
SETTINGS_SUBNAV = [
    {'href': href, 'label': label}
    for href, label in (
        ('/settings/keys', 'API Keys'),
        ('/settings/cookies', 'YouTube Cookies'),
        ('/settings/vpn', 'NordVPN'),
        ('/settings/health', 'Service Health'),
    )
]
def _format_source_citation(payload):
"""Format a human-readable citation from a search result payload."""
book = payload.get('book_title', '')
if not book:
book = clean_filename_to_title(payload.get('filename', 'Unknown'))
page = payload.get('page_ref', '')
if page:
page_str = str(page)
if not page_str.startswith('p'):
page_str = f"p. {page_str}"
return f"{book}, {page_str}"
return book
def _resolve_upload_path(category, config):
"""Resolve the target directory for an upload given a category name."""
upload_paths = config.get('upload_paths', {})
library_root = config['library_root']
if category in upload_paths:
return upload_paths[category]
default_path = upload_paths.get('default', library_root)
safe_category = secure_filename(category) if category else ''
if safe_category:
return os.path.join(default_path, safe_category)
return default_path
def _process_upload(filepath, original_filename, category, config, db):
    """Ingest one uploaded PDF: hash, dedup, copy into the library, catalogue, queue.

    Args:
        filepath: Path of the uploaded file to ingest.
        original_filename: Client-supplied name; sanitised before use.
        category: Category used to resolve the target directory.
        config: Configuration mapping; ``library_root`` is required.
        db: Status database used for the duplicate check, catalogue entry
            and queueing.

    Returns:
        A dict with ``hash``, ``filename``, ``category``, ``source``,
        ``path``, ``size_bytes`` and ``status`` (always ``'queued'``).

    Raises:
        ValueError: if a catalogue row with the same content hash exists.
    """
    library_root = config['library_root']
    file_hash = content_hash(filepath)

    # Reject duplicates by content hash before anything is written to disk.
    existing = db._get_conn().execute(
        "SELECT * FROM catalogue WHERE hash = ?", (file_hash,)
    ).fetchone()
    if existing:
        raise ValueError(f"Duplicate: file already catalogued as {existing['filename']}")

    target_dir = _resolve_upload_path(category, config)
    os.makedirs(target_dir, exist_ok=True)

    # Fall back to a hash-based name when sanitising leaves nothing usable.
    safe_name = secure_filename(original_filename) or f"{file_hash}.pdf"

    target_path = os.path.join(target_dir, safe_name)
    if os.path.exists(target_path):
        # A file with this name is already on disk: add a short hash suffix
        # to the stored path. The catalogued filename stays `safe_name`.
        base, ext = os.path.splitext(safe_name)
        target_path = os.path.join(target_dir, f"{base}_{file_hash[:8]}{ext}")

    shutil.copy2(filepath, target_path)
    stored_size = os.path.getsize(target_path)
    source, derived_category = derive_source_and_category(target_path, library_root)

    db.add_to_catalogue(file_hash, safe_name, target_path, stored_size, source, derived_category)
    db.queue_document(file_hash)

    return {
        'hash': file_hash,
        'filename': safe_name,
        'category': derived_category,
        'source': source,
        'path': target_path,
        'size_bytes': stored_size,
        'status': 'queued',
    }
# ── Page Routes ──
@app.route('/')
def dashboard():
    """Render the Knowledge dashboard page."""
    context = {
        'domain': 'knowledge',
        'subnav': KNOWLEDGE_SUBNAV,
        'active_page': '/',
    }
    return render_template('knowledge/dashboard.html', **context)
def _search_hit_domains(raw):
    """Normalise a payload's ``domain`` value (str, list or other) to a list."""
    if isinstance(raw, str):
        return [raw] if raw else []
    if isinstance(raw, list):
        return raw
    return []


def _format_search_hit(point):
    """Convert one scored Qdrant point into the dict passed to the search template."""
    # A point stored without payload yields None; treat it as empty so one
    # bad record cannot abort the whole result page.
    payload = point.payload or {}

    summary = payload.get('summary')
    if summary is None:
        # Fall back to the first 200 characters of the content, tolerating a
        # missing or null 'content' field.
        summary = (payload.get('content') or '')[:200]

    return {
        'score': point.score,
        'title': payload.get('title', 'Untitled'),
        'summary': summary,
        'citation': _format_source_citation(payload),
        'download_url': payload.get('download_url', ''),
        'source_type': payload.get('source_type', 'document'),
        'knowledge_type': payload.get('knowledge_type', ''),
        'complexity': payload.get('complexity', ''),
        'domains': _search_hit_domains(payload.get('domain', [])),
    }


@app.route('/search')
def search_page():
    """Render the search page, running a vector search when ``q`` is supplied.

    Query parameters:
        q: Search text. When empty, the bare search form is rendered.
        limit: Maximum number of hits (default 20). Non-numeric or
            non-positive values fall back to the default.
        source_type: Optional exact-match filter on the ``source_type``
            payload field.

    Any exception raised while embedding the query, querying Qdrant or
    formatting the hits is logged and its message is rendered on the page.
    """
    query = request.args.get('q', '')
    if not query:
        return render_template('search.html', domain='search', subnav=None, active_page='/search')

    config = get_config()

    # type=int makes a malformed ?limit= fall back to the default instead of
    # raising ValueError, which would surface as an HTTP 500.
    limit = request.args.get('limit', 20, type=int)
    if limit < 1:
        limit = 20
    source_filter = request.args.get('source_type', None)

    try:
        # Imported here, as in the original, so an import failure is
        # reported on the page like any other search error.
        from .embedder import get_embedding_single
        query_vector = get_embedding_single(query, config)

        qdrant = QdrantClient(
            host=config['vector_db']['host'],
            port=config['vector_db']['port'],
            timeout=60,
        )

        search_filter = None
        if source_filter:
            search_filter = Filter(must=[
                FieldCondition(key="source_type", match=MatchValue(value=source_filter))
            ])

        hits = qdrant.query_points(
            collection_name=config['vector_db']['collection'],
            query=query_vector,
            limit=limit,
            query_filter=search_filter,
        ).points

        formatted = [_format_search_hit(hit) for hit in hits]
        return render_template('search.html', domain='search', subnav=None, active_page='/search',
                               query=query, results=formatted)
    except Exception as e:
        # Request boundary: keep the page usable, but record the traceback.
        logger.exception("Search failed for query %r", query)
        return render_template('search.html', domain='search', subnav=None, active_page='/search',
                               query=query, error=str(e))
@app.route('/catalogue')
def catalogue_page():
    """Render one page of the document catalogue, optionally filtered.

    Query parameters:
        source, category: Optional filters passed to the StatusDB queries.
        per_page: Page size (default 50). Non-numeric or non-positive
            values fall back to the default.
        page: 1-based page number (default 1). Non-numeric values fall back
            to 1, and the value is clamped to ``1..total_pages``.
    """
    db = StatusDB()
    source = request.args.get('source', None)
    category = request.args.get('category', None)

    # type=int makes malformed values fall back to the default instead of
    # raising ValueError, which would surface as an HTTP 500.
    per_page = request.args.get('per_page', 50, type=int)
    if per_page < 1:
        # Guards the division below (per_page=0) and negative limits/offsets.
        per_page = 50
    page = max(1, request.args.get('page', 1, type=int))

    total_count = db.count_documents(source=source, category=category)
    # Ceiling division; an empty catalogue still has one (empty) page.
    total_pages = max(1, (total_count + per_page - 1) // per_page)
    page = min(page, total_pages)
    offset = (page - 1) * per_page

    docs = db.get_all_documents(source=source, category=category, limit=per_page, offset=offset)
    sources = db.get_sources()
    return render_template('knowledge/catalogue.html',
                           domain='knowledge', subnav=KNOWLEDGE_SUBNAV, active_page='/catalogue',
                           docs=docs, sources=sources, current_source=source,
                           page=page, per_page=per_page, total_pages=total_pages,
                           total_count=total_count)
@app.route('/upload')
def upload_page():
db = StatusDB()
config = get_config()
upload_paths = config.get('upload_paths', {})
categories = sorted(k for k in upload_paths if k != 'default')
db_sources = db.get_sources()
for s in db_sources:
if s not in categories:
categories.append(s)
options_html = ''.join(f'