"""
RECON Web Dashboard & API
Flask app on port 8420. Jinja2 templates + static files.
Pages: Knowledge (Dashboard, Catalogue, Upload, Web Ingest, Failures),
PeerTube (Dashboard, Channels), Search, Settings (Keys, Cookies, VPN, Health).
API endpoints for all pipeline operations including crawl, ingest, and search.
Dependencies: Flask, qdrant-client, requests
Config: web, vector_db, embedding sections of config.yaml
"""
import glob
import json
import threading
import os
import shutil
import tempfile
import requests as http_requests
from flask import Flask, request, jsonify, redirect, render_template
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue
from werkzeug.utils import secure_filename
from .utils import get_config, content_hash, clean_filename_to_title, derive_source_and_category, generate_download_url, setup_logging
from .status import StatusDB
# Module-wide logger, obtained from the project's setup_logging helper under
# the name 'recon.api'.
logger = setup_logging('recon.api')
# ── Background cache warmer ──
# All expensive queries run proactively so API endpoints never block.
_cache = {
'knowledge_stats': None,
'pt_dashboard': None,
'qdrant_scroll': None,
'qdrant_scroll_ts': 0,
'quick_stats': None,
'kiwix_sources': None,
}
# Templates and static assets are looked up in the parent of the directory
# that contains this module.
_APP_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

app = Flask(
    __name__,
    template_folder=os.path.join(_APP_ROOT, 'templates'),
    static_folder=os.path.join(_APP_ROOT, 'static'),
)
# No request-size cap: ZIM files can be multi-GB
app.config['MAX_CONTENT_LENGTH'] = None

# ── Address Book Blueprint ──
# NOTE(review): imported here rather than at the top of the file — possibly
# to avoid a circular import; confirm before moving it.
from .address_book_api import address_book_bp
app.register_blueprint(address_book_bp)
# ── Navigation Constants ──
# Each subnav is a list of {'href': ..., 'label': ...} dicts, built from
# (href, label) pairs in the order listed.
KNOWLEDGE_SUBNAV = [
    {'href': href, 'label': label}
    for href, label in (
        ('/', 'Dashboard'),
        ('/catalogue', 'Catalogue'),
        ('/upload', 'Upload'),
        ('/web-ingest', 'Web Ingest'),
        ('/failures', 'Failures'),
    )
]

PEERTUBE_SUBNAV = [
    {'href': href, 'label': label}
    for href, label in (
        ('/peertube', 'Dashboard'),
        ('/peertube/channels', 'Channels'),
    )
]

KIWIX_SUBNAV = []  # Single-page, no subnav needed

SETTINGS_SUBNAV = [
    {'href': href, 'label': label}
    for href, label in (
        ('/settings/keys', 'API Keys'),
        ('/settings/cookies', 'YouTube Cookies'),
        ('/settings/vpn', 'NordVPN'),
        ('/settings/health', 'Service Health'),
    )
]
def _format_source_citation(payload):
"""Format a human-readable citation from a search result payload."""
book = payload.get('book_title', '')
if not book:
book = clean_filename_to_title(payload.get('filename', 'Unknown'))
page = payload.get('page_ref', '')
if page:
page_str = str(page)
if not page_str.startswith('p'):
page_str = f"p. {page_str}"
return f"{book}, {page_str}"
return book
# File suffixes (with leading dot) recognised for uploads.
ALLOWED_EXTENSIONS = {
    '.pdf',
    '.txt',
    '.epub',
    '.doc',
    '.docx',
    '.mobi',
}

# Suffix -> hopper directory the file is dropped into. Only '.txt' goes to the
# text hopper; every other suffix listed here shares the pdf hopper.
HOPPER_ROUTING = {
    suffix: (
        '/opt/recon/data/acquired/text/'
        if suffix == '.txt'
        else '/opt/recon/data/acquired/pdf/'
    )
    for suffix in ('.pdf', '.txt', '.epub', '.doc', '.docx', '.mobi')
}
def _process_upload(filepath, original_filename, ext, category, config, db):
    """Process an upload: hash, dedup, drop into hopper for dispatcher pickup.

    The file is stored in the hopper as ``<hash><ext>`` next to a
    ``<hash>.meta.json`` sidecar describing it.

    Args:
        filepath: Path of the uploaded file on disk.
        original_filename: Name the file was uploaded under; its stem becomes
            the sidecar title.
        ext: File suffix including the leading dot (e.g. '.pdf'). Suffixes not
            in HOPPER_ROUTING fall back to the pdf hopper.
        category: Category recorded in the sidecar.
        config: Unused here; kept so the call signature stays stable.
        db: Object whose ``_get_conn()`` returns a connection supporting
            ``execute(...).fetchone()`` with rows addressable by column name.

    Returns:
        dict with 'hash', 'filename', 'source_type' and 'status' ('queued').

    Raises:
        ValueError: If the content hash is already in the catalogue table, or
            a file named after the hash already sits in a hopper directory.
    """
    file_hash = content_hash(filepath)

    conn = db._get_conn()
    existing = conn.execute("SELECT * FROM catalogue WHERE hash = ?", (file_hash,)).fetchone()
    if existing:
        raise ValueError(f"Duplicate: file already catalogued as {existing['filename']}")

    # Also check if already sitting in a hopper dir awaiting dispatch.
    # Several suffixes share a directory, so visit each directory only once.
    for hopper in dict.fromkeys(HOPPER_ROUTING.values()):
        if any(os.path.exists(os.path.join(hopper, file_hash + e)) for e in ALLOWED_EXTENSIONS):
            raise ValueError("Duplicate: file already queued for processing")

    hopper_dir = HOPPER_ROUTING.get(ext, '/opt/recon/data/acquired/pdf/')
    os.makedirs(hopper_dir, exist_ok=True)
    target_path = os.path.join(hopper_dir, file_hash + ext)
    meta_path = os.path.join(hopper_dir, file_hash + '.meta.json')

    stem = os.path.splitext(original_filename)[0]
    sidecar = {
        'title': stem,
        'source': 'dashboard_upload',
        'source_type': ext.lstrip('.'),
        'category': category,
        'original_filename': original_filename,
    }

    # Write sidecar first, then content. Both are written under a '.tmp' name
    # and renamed into place, so the final names only ever appear complete.
    # NOTE(review): assumes the dispatcher ignores '*.tmp' files, as the
    # sidecar write already relied on — confirm.
    tmp_meta = meta_path + '.tmp'
    tmp_target = target_path + '.tmp'
    try:
        with open(tmp_meta, 'w', encoding='utf-8') as f:
            json.dump(sidecar, f, indent=2)
        os.replace(tmp_meta, meta_path)
        shutil.copy2(filepath, tmp_target)
        os.replace(tmp_target, target_path)
    except Exception:
        # Best-effort cleanup so a failed upload leaves neither a partial
        # content file nor an orphaned sidecar behind; the original error
        # is what the caller sees.
        for leftover in (tmp_meta, tmp_target, meta_path):
            try:
                os.remove(leftover)
            except OSError:
                pass
        raise

    return {
        'hash': file_hash,
        'filename': original_filename,
        'source_type': ext.lstrip('.'),
        'status': 'queued',
    }
# ── Page Routes ──
@app.route('/')
def dashboard():
    """Render the Knowledge dashboard page with the Knowledge subnav."""
    template_context = {
        'domain': 'knowledge',
        'subnav': KNOWLEDGE_SUBNAV,
        'active_page': '/',
    }
    return render_template('knowledge/dashboard.html', **template_context)
def _search_hit_domains(raw_dom):
    """Normalise a payload's 'domain' value (str, list or anything else) to a list."""
    if isinstance(raw_dom, str):
        return [raw_dom] if raw_dom else []
    if isinstance(raw_dom, list):
        return raw_dom
    return []


def _format_search_hit(point):
    """Flatten one scored search result into the dict the search template receives."""
    # A point stored without payload comes back with payload=None.
    p = point.payload or {}

    summary = p.get('summary')
    if summary is None:
        # Fall back to the first 200 characters of the content; guard against
        # a null 'content' so one bad point cannot fail the whole search.
        summary = (p.get('content') or '')[:200]

    return {
        'score': point.score,
        'title': p.get('title', 'Untitled'),
        'summary': summary,
        'citation': _format_source_citation(p),
        'download_url': p.get('download_url', ''),
        'source_type': p.get('source_type', 'document'),
        'knowledge_type': p.get('knowledge_type', ''),
        'complexity': p.get('complexity', ''),
        'domains': _search_hit_domains(p.get('domain', [])),
    }


@app.route('/search')
def search_page():
    """Render the search page, running a vector search when 'q' is given.

    Query parameters:
        q: Search text. When empty, the bare search form is rendered.
        limit: Maximum number of results (default 20). Non-numeric values fall
            back to the default; values below 1 are raised to 1.
        source_type: Optional exact-match filter on the 'source_type' payload field.

    Any failure while embedding or querying is logged and shown on the page
    as an error message instead of propagating.
    """
    query = request.args.get('q', '')
    if not query:
        return render_template('search.html', domain='search', subnav=None, active_page='/search')

    config = get_config()
    # type=int makes the lookup return the default instead of raising on "?limit=abc".
    limit = max(1, request.args.get('limit', 20, type=int))
    source_filter = request.args.get('source_type', None)

    try:
        from .embedder import get_embedding_single
        query_vector = get_embedding_single(query, config)

        qdrant = QdrantClient(
            host=config['vector_db']['host'],
            port=config['vector_db']['port'],
            timeout=60
        )

        search_filter = None
        if source_filter:
            search_filter = Filter(must=[
                FieldCondition(key="source_type", match=MatchValue(value=source_filter))
            ])

        results = qdrant.query_points(
            collection_name=config['vector_db']['collection'],
            query=query_vector,
            limit=limit,
            query_filter=search_filter
        ).points

        formatted = [_format_search_hit(r) for r in results]
        return render_template('search.html', domain='search', subnav=None, active_page='/search',
                               query=query, results=formatted)
    except Exception as e:
        # Page-level boundary: keep the traceback in the log, show the message to the user.
        logger.exception("Search failed for query %r", query)
        return render_template('search.html', domain='search', subnav=None, active_page='/search',
                               query=query, error=str(e))
@app.route('/catalogue')
def catalogue_page():
    """Render one page of the document catalogue.

    Query parameters:
        source: Optional source filter passed to the status DB.
        category: Optional category filter passed to the status DB.
        per_page: Page size (default 50). Non-numeric values fall back to the
            default; values below 1 are raised to 1 so the page-count division
            can never divide by zero.
        page: 1-based page number (default 1), clamped to [1, total_pages].
    """
    db = StatusDB()
    source = request.args.get('source', None)
    category = request.args.get('category', None)

    # type=int returns the default instead of raising on non-numeric input.
    per_page = max(1, request.args.get('per_page', 50, type=int))
    page = max(1, request.args.get('page', 1, type=int))

    total_count = db.count_documents(source=source, category=category)
    # Ceiling division; an empty catalogue still has one (empty) page.
    total_pages = max(1, (total_count + per_page - 1) // per_page)
    page = min(page, total_pages)
    offset = (page - 1) * per_page

    docs = db.get_all_documents(source=source, category=category, limit=per_page, offset=offset)
    sources = db.get_sources()
    return render_template('knowledge/catalogue.html',
                           domain='knowledge', subnav=KNOWLEDGE_SUBNAV, active_page='/catalogue',
                           docs=docs, sources=sources, current_source=source,
                           page=page, per_page=per_page, total_pages=total_pages, total_count=total_count)
@app.route('/upload')
def upload_page():
db = StatusDB()
config = get_config()
upload_paths = config.get('upload_paths', {})
categories = sorted(k for k in upload_paths if k != 'default')
db_sources = db.get_sources()
for s in db_sources:
if s not in categories:
categories.append(s)
options_html = ''.join(f'