Merge refactor branch: RECON v1.0.0

This merge integrates the complete refactor effort spanning Phases 0-6k, bringing RECON from its initial baseline into production-grade form. Pipeline architecture --------------------- - Phases 0-2: foundation cleanup, removed dead code, standardized logging - Phase 3: dispatcher rewrite — watches data/acquired/<subfolder>/ for {hash}.txt + {hash}.meta.json pairs, atomic .tmp rename, idempotent - Phase 4: content processors for PDF (PyPDF2 -> pdftotext -> Tesseract -> Gemini Vision fallback chain), transcript, and text formats - Phase 5: enrichment, embedding, and filing daemons split into independently restartable threads PeerTube acquisition -------------------- - Phase 6a-6c: PeerTube channel watcher, caption acquisition with rate limiting (429 handling), 0.5s rate_limit_delay enforced - Phase 6d: multi-instance support - Phase 6e: rewired then reverted dashboard PeerTube endpoint to live in acquisition module Format handling & library cleanup --------------------------------- - Phase 6f: text processor for .txt ingestion - Phase 6f-2: format normalizer in dispatcher - Phase 6g-6j: library reorg — ghost domain cleanup, SCL moved to dedicated domain folder, pi-nas fully decommissioned as a storage target (NFS-only now), ~73 GB reclaimed - Phase 6k: hash-identical dedup — 2,477 duplicate PDFs removed, 22.05 GB freed, catalogue/documents/Qdrant payloads updated coherently, 226 empty domain subdirs pruned - 16,340 transcripts remain un-filed pending title-match review Dashboard & metadata -------------------- - Gemini "null" string bug fixed in pdf_processor metadata voting - Dashboard upload migrated to pipeline with multi-format support State at release ---------------- - 7 daemon threads: dispatcher, enrich, embed, filing, peertube-acq, progress, dashboard - 29,201 documents in catalogue / documents tables (UNIQUE on hash PK) - ~2.1M Qdrant vectors in recon_knowledge_hybrid (cortex:6333) - ~67 GB library on /mnt/library (NFS from pi-nas) - files.echo6.co serving 9,397 deduped PDFs - recon.echo6.co dashboard + API on :8420 See cleanup-log.md for the full backlog and resolution history.
2026-05-20 06:34:40 +02:00 · 2026-04-16 18:20:25 +00:00 · 2026-04-16 18:20:25 +00:00 · 8d54ff165d
commit 8d54ff165d
parent 563c16bb71 e6224cb279
18 changed files with 1982 additions and 849 deletions
--- a/config.yaml
+++ b/config.yaml
@ -411,6 +411,7 @@ peertube:
  public_url: https://stream.echo6.co  # Public URL for video links
  fetch_timeout: 30                     # HTTP timeout for API/VTT requests
  rate_limit_delay: 0.5                 # Delay between video ingestions (seconds)
  poll_interval: 1800                    # Seconds between PeerTube acquisition polls (30 min)
 # Stream B: New Library Pipeline
 new_pipeline:
@ -436,5 +437,6 @@ pipeline:
    pdf: pdf_processor
    stream: transcript_processor
    html: html_processor
    text: text_processor
  # mtime stability threshold for picking up files from acquired/
  mtime_stability_seconds: 10
--- a/lib/acquisition/init.py
+++ b/lib/acquisition/init.py
--- a/lib/acquisition/peertube.py
+++ b/lib/acquisition/peertube.py
@ -0,0 +1,224 @@
 """
 RECON PeerTube Acquisition Module
 Polls PeerTube for new video transcripts and writes them as flat file pairs
 into data/acquired/stream/ for the dispatcher to pick up.
 Does NOT touch the database — that's transcript_processor's job.
 """
 import json
 import os
 import time
 from lib.peertube_scraper import get_videos, get_captions, fetch_vtt, vtt_to_text, _get_pt_config
 from lib.utils import content_hash, get_config, setup_logging
 logger = setup_logging("recon.acquisition.peertube")
 def _build_known_sets(db):
    """Build sets of known UUIDs and titles from catalogue.
    Queries catalogue once per batch for dedup against both cohorts:
    - URL-path rows: extract UUID from https://stream.echo6.co/w/{uuid}
    - Library-path rows: extract title from filename column
    """
    conn = db._get_conn()
    rows = conn.execute(
        "SELECT path, filename FROM catalogue WHERE source = 'stream.echo6.co'"
    ).fetchall()
    known_uuids = set()
    known_titles = set()
    for row in rows:
        path = row['path'] or ''
        if '/w/' in path:
            known_uuids.add(path.rsplit('/w/', 1)[-1])
        fname = row['filename'] or ''
        if fname.endswith('.txt'):
            known_titles.add(fname[:-4])
        else:
            known_titles.add(fname)
    return known_uuids, known_titles
 def list_new_videos(db, config=None):
    """Find PeerTube videos with captions not yet in catalogue.
    Returns list of (video_dict, caption_path) tuples for videos that have
    captions and are not in the known UUID or title sets.
    """
    if config is None:
        config = get_config()
    ptc = _get_pt_config(config)
    rate_delay = ptc.get('rate_limit_delay', 0.5)
    known_uuids, known_titles = _build_known_sets(db)
    videos = get_videos(config=config)
    new_videos = []
    checked = 0
    for video in videos:
        if video['uuid'] in known_uuids:
            continue
        if video['name'] in known_titles:
            continue
        # Rate limit caption API calls
        if checked > 0:
            time.sleep(rate_delay)
        checked += 1
        try:
            captions = get_captions(video['uuid'], config)
        except Exception as e:
            logger.warning("[peertube] Failed to get captions for %s: %s",
                           video['uuid'][:8], e)
            continue
        if not captions:
            continue
        # Prefer English caption
        caption_path = None
        for c in captions:
            if c.get('language', {}).get('id') == 'en':
                caption_path = c['captionPath']
                break
        if caption_path is None:
            caption_path = captions[0]['captionPath']
        new_videos.append((video, caption_path))
    return new_videos
 def acquire_one(video, caption_path, config=None):
    """Fetch transcript and write to hopper as flat files.
    Returns hash string on success, None on skip/error.
    Does NOT touch the database — that's transcript_processor's job.
    """
    if config is None:
        config = get_config()
    ptc = _get_pt_config(config)
    pipeline_cfg = config.get('pipeline', {})
    hopper_dir = os.path.join(
        pipeline_cfg.get('acquired_root', '/opt/recon/data/acquired'),
        'stream'
    )
    os.makedirs(hopper_dir, exist_ok=True)
    uuid = video['uuid']
    # Fetch and convert VTT
    vtt_content = fetch_vtt(caption_path, config)
    text, cue_timestamps = vtt_to_text(vtt_content)
    if not text or len(text.strip()) < 50:
        logger.debug("[peertube] Transcript too short for %s (%s): %d chars",
                     video['name'], uuid, len(text) if text else 0)
        return None
    # Write text to temp file, hash it, then rename to final name
    tmp_txt = os.path.join(hopper_dir, f'{uuid}.txt.tmp')
    with open(tmp_txt, 'w', encoding='utf-8') as f:
        f.write(text)
    file_hash = content_hash(tmp_txt)
    # Check if final file already exists (race condition guard)
    final_txt = os.path.join(hopper_dir, f'{file_hash}.txt')
    final_meta = os.path.join(hopper_dir, f'{file_hash}.meta.json')
    if os.path.exists(final_txt):
        os.remove(tmp_txt)
        logger.debug("[peertube] Hopper file already exists: %s", file_hash[:8])
        return None
    # Build sidecar metadata
    video_url = f"{ptc['public_url']}/w/{uuid}"
    meta = {
        'title': video['name'],
        'source_url': video_url,
        'url': video_url,
        'source': 'stream.echo6.co',
        'source_type': 'transcript',
        'category': 'Transcript',
        'channel': video.get('channel_display', ''),
        'duration': video.get('duration', 0),
        'uuid': uuid,
        'cue_timestamps': cue_timestamps,
    }
    # Write meta to tmp, then rename both atomically
    # Meta first, then content — dispatcher only picks up when content file exists
    tmp_meta = os.path.join(hopper_dir, f'{file_hash}.meta.json.tmp')
    with open(tmp_meta, 'w', encoding='utf-8') as f:
        json.dump(meta, f, indent=2)
    os.rename(tmp_meta, final_meta)
    os.rename(tmp_txt, final_txt)
    logger.info("[peertube] Acquired: %s (%s) -> %s",
                video['name'], uuid[:8], file_hash[:12])
    return file_hash
 def acquire_batch(db, config=None):
    """One-shot: find new videos and acquire them.
    Returns dict: {'acquired': N, 'skipped': N, 'errors': N}
    """
    if config is None:
        config = get_config()
    ptc = _get_pt_config(config)
    rate_delay = ptc.get('rate_limit_delay', 0.5)
    result = {'acquired': 0, 'skipped': 0, 'errors': 0}
    try:
        new_videos = list_new_videos(db, config)
    except Exception as e:
        logger.error("[peertube] Failed to list new videos: %s", e, exc_info=True)
        result['errors'] = 1
        return result
    if not new_videos:
        logger.debug("[peertube] No new videos found")
        return result
    logger.info("[peertube] Found %d new videos to acquire", len(new_videos))
    for i, (video, caption_path) in enumerate(new_videos):
        if i > 0:
            time.sleep(rate_delay)
        try:
            file_hash = acquire_one(video, caption_path, config)
            if file_hash:
                result['acquired'] += 1
            else:
                result['skipped'] += 1
        except Exception as e:
            logger.error("[peertube] Error acquiring %s (%s): %s",
                         video['name'], video['uuid'][:8], e, exc_info=True)
            result['errors'] += 1
    return result
 def acquisition_loop(stop_event, db, config, interval=1800):
    """Service loop: poll PeerTube for new transcripts every interval seconds."""
    logger.info("[peertube] Acquisition loop started (interval: %ds)", interval)
    while not stop_event.is_set():
        try:
            result = acquire_batch(db, config)
            if result['acquired']:
                logger.info("[peertube] Acquired %d new transcripts (%d skipped, %d errors)",
                            result['acquired'], result['skipped'], result['errors'])
            else:
                logger.debug("[peertube] No new transcripts")
        except Exception as e:
            logger.error("[peertube] Error: %s", e, exc_info=True)
        stop_event.wait(interval)
    logger.info("[peertube] Acquisition loop stopped")
--- a/lib/api.py
+++ b/lib/api.py
@ -9,6 +9,7 @@ API endpoints for all pipeline operations including crawl, ingest, and search.
 Dependencies: Flask, qdrant-client, requests
 Config: web, vector_db, embedding sections of config.yaml
 """
 import glob
 import json
 import threading
 import os
@ -77,25 +78,20 @@ def _format_source_citation(payload):
    return book
-def _resolve_upload_path(category, config):
+ALLOWED_EXTENSIONS = {'.pdf', '.txt', '.epub', '.doc', '.docx', '.mobi'}
    """Resolve the target directory for an upload given a category name."""
    upload_paths = config.get('upload_paths', {})
    library_root = config['library_root']
-    if category in upload_paths:
+HOPPER_ROUTING = {
-        return upload_paths[category]
+    '.pdf':  '/opt/recon/data/acquired/pdf/',
-
+    '.txt':  '/opt/recon/data/acquired/text/',
-    default_path = upload_paths.get('default', library_root)
+    '.epub': '/opt/recon/data/acquired/pdf/',
-    safe_category = secure_filename(category) if category else ''
+    '.doc':  '/opt/recon/data/acquired/pdf/',
-    if safe_category:
+    '.docx': '/opt/recon/data/acquired/pdf/',
-        return os.path.join(default_path, safe_category)
+    '.mobi': '/opt/recon/data/acquired/pdf/',
-    return default_path
+}
-def _process_upload(filepath, original_filename, category, config, db):
+def _process_upload(filepath, original_filename, ext, category, config, db):
-    """Process a single PDF upload: hash, dedup, copy to library, catalogue, queue."""
+    """Process an upload: hash, dedup, drop into hopper for dispatcher pickup."""
    library_root = config['library_root']
    file_hash = content_hash(filepath)
    conn = db._get_conn()
@ -103,34 +99,39 @@ def _process_upload(filepath, original_filename, category, config, db):
    if existing:
        raise ValueError(f"Duplicate: file already catalogued as {existing['filename']}")
-    target_dir = _resolve_upload_path(category, config)
+    # Also check if already sitting in a hopper dir awaiting dispatch
-    os.makedirs(target_dir, exist_ok=True)
+    for hopper in HOPPER_ROUTING.values():
        if any(os.path.exists(os.path.join(hopper, file_hash + e)) for e in ALLOWED_EXTENSIONS):
            raise ValueError("Duplicate: file already queued for processing")
-    safe_name = secure_filename(original_filename)
+    hopper_dir = HOPPER_ROUTING.get(ext, '/opt/recon/data/acquired/pdf/')
-    if not safe_name:
+    os.makedirs(hopper_dir, exist_ok=True)
        safe_name = f"{file_hash}.pdf"
    target_path = os.path.join(target_dir, safe_name)
-    if os.path.exists(target_path):
+    target_path = os.path.join(hopper_dir, file_hash + ext)
-        base, ext = os.path.splitext(safe_name)
+    meta_path = os.path.join(hopper_dir, file_hash + '.meta.json')
-        target_path = os.path.join(target_dir, f"{base}_{file_hash[:8]}{ext}")
+
    stem = os.path.splitext(original_filename)[0]
    sidecar = {
        'title': stem,
        'source': 'dashboard_upload',
        'source_type': ext.lstrip('.'),
        'category': category,
        'original_filename': original_filename,
    }
    # Write sidecar first (with .tmp safety), then content
    tmp_meta = meta_path + '.tmp'
    with open(tmp_meta, 'w', encoding='utf-8') as f:
        json.dump(sidecar, f, indent=2)
    os.rename(tmp_meta, meta_path)
    shutil.copy2(filepath, target_path)
    size = os.path.getsize(target_path)
    source, derived_category = derive_source_and_category(target_path, library_root)
    db.add_to_catalogue(file_hash, safe_name, target_path, size, source, derived_category)
    db.queue_document(file_hash)
    return {
        'hash': file_hash,
-        'filename': safe_name,
+        'filename': original_filename,
-        'category': derived_category,
+        'source_type': ext.lstrip('.'),
-        'source': source,
+        'status': 'queued',
        'path': target_path,
        'size_bytes': size,
        'status': 'queued'
    }
@ -346,22 +347,23 @@ def api_upload():
    if not file.filename:
        return jsonify({'error': 'No file selected'}), 400
-    if not file.filename.lower().endswith('.pdf'):
+    ext = os.path.splitext(file.filename)[1].lower()
-        return jsonify({'error': 'Only PDF files are accepted'}), 400
+    if ext not in ALLOWED_EXTENSIONS:
        return jsonify({'error': f'Unsupported file type: {ext}'}), 400
    category = request.form.get('category', '').strip()
    config = get_config()
    db = StatusDB()
-    tmp_fd, tmp_path = tempfile.mkstemp(suffix='.pdf')
+    tmp_fd, tmp_path = tempfile.mkstemp(suffix=ext)
    try:
        file.save(tmp_path)
        if os.path.getsize(tmp_path) == 0:
            return jsonify({'error': 'Uploaded file is empty'}), 400
-        result = _process_upload(tmp_path, file.filename, category, config, db)
+        result = _process_upload(tmp_path, file.filename, ext, category, config, db)
        return jsonify(result), 201
    except ValueError as e:
@ -390,6 +392,28 @@ def api_upload_status(doc_hash):
                'filename': cat['filename'],
                'status': cat['status'],
            })
        # Check hopper dirs for files awaiting dispatcher pickup
        for hopper in ('/opt/recon/data/acquired/pdf/', '/opt/recon/data/acquired/text/'):
            if glob.glob(os.path.join(hopper, doc_hash + '.*')):
                return jsonify({
                    'hash': doc_hash,
                    'status': 'pending',
                    'message': 'Waiting for dispatcher',
                })
        # Check processing dir
        proc_dir = os.path.join(
            config.get('pipeline', {}).get('processing_root', '/opt/recon/data/processing'),
            doc_hash,
        )
        if os.path.isdir(proc_dir):
            return jsonify({
                'hash': doc_hash,
                'status': 'processing',
                'message': 'Being processed',
            })
        return jsonify({'error': 'Document not found'}), 404
    result = {
@ -882,7 +906,11 @@ def _build_knowledge_stats():
    sources = conn.execute("""
        SELECT
            c.source,
-            CASE WHEN c.path LIKE 'http%' THEN 'web' ELSE 'pdf' END as type,
+            CASE
              WHEN c.source = 'stream.echo6.co' THEN 'transcript'
              WHEN c.path LIKE 'http%' THEN 'web'
              ELSE 'pdf'
            END as type,
            COUNT(DISTINCT c.hash) as catalogued,
            COUNT(DISTINCT CASE WHEN d.status = 'complete' THEN d.hash END) as complete,
            COUNT(DISTINCT CASE WHEN d.status NOT IN ('complete', 'failed') AND d.status IS NOT NULL THEN d.hash END) as in_pipeline,
@ -935,11 +963,17 @@ def _build_knowledge_stats():
    """).fetchone()[0]
    recent = conn.execute("""
-        SELECT book_title, status, concepts_extracted, vectors_inserted,
+        SELECT COALESCE(d.book_title, c.filename) as title,
-               CASE WHEN path LIKE 'http%' THEN 'web' ELSE 'pdf' END as type
+               d.status, d.concepts_extracted, d.vectors_inserted,
-        FROM documents
+               CASE
-        WHERE status = 'complete'
+                 WHEN c.source = 'stream.echo6.co' THEN 'transcript'
-        ORDER BY embedded_at DESC
+                 WHEN d.path LIKE 'http%' THEN 'web'
                 ELSE 'pdf'
               END as type
        FROM documents d
        JOIN catalogue c ON d.hash = c.hash
        WHERE d.status = 'complete'
        ORDER BY d.embedded_at DESC
        LIMIT 10
    """).fetchall()
@ -979,7 +1013,7 @@ def _build_knowledge_stats():
        'source_types': dict(sorted(source_type_counts.items(), key=lambda x: -x[1])),
        'sample_size': sample_size,
        'recent_complete': [{
-            'title': r['book_title'] or 'Untitled',
+            'title': r['title'] or 'Untitled',
            'concepts': r['concepts_extracted'] or 0,
            'vectors': r['vectors_inserted'] or 0,
            'type': r['type'],
--- a/lib/crawler.py
+++ b/lib/crawler.py
@ -1,432 +0,0 @@
 """
 RECON Site Crawler — URL discovery for bulk web ingestion.
 Two discovery strategies:
 1. Sitemap-based (preferred) — parses sitemap.xml for all URLs
 2. Link-following (fallback) — crawls from root URL following internal links
 Discovered URLs are fed into web_scraper.ingest_url() for processing.
 """
 import re
 import time
 from collections import deque
 from urllib.parse import urlparse, urljoin, urldefrag
 import requests
 from lxml import etree
 from .utils import get_config, setup_logging
 logger = setup_logging('recon.crawler')
 def _get_crawler_config(config=None):
    """Load crawler config with defaults."""
    if config is None:
        config = get_config()
    crawler_cfg = config.get('crawler', {})
    web_cfg = config.get('web_scraper', {})
    return {
        'user_agent': (
            crawler_cfg.get('user_agent') or
            web_cfg.get('user_agent') or
            'Mozilla/5.0 (compatible; RECON/1.0)'
        ),
        'fetch_timeout': crawler_cfg.get('fetch_timeout', 30),
        'rate_limit_delay': crawler_cfg.get('rate_limit_delay', 1.0),
        'max_pages': crawler_cfg.get('max_pages', 500),
        'max_depth': crawler_cfg.get('max_depth', 3),
        'default_exclude': crawler_cfg.get('default_exclude', [
            '/search', '/404', '/login', '/signup', '/auth/', '/api/', '/assets/', '/static/'
        ]),
    }
 # ─── Sitemap Discovery ─────────────────────────────────────────────
 def discover_sitemap_url(base_url, config=None):
    """
    Find the sitemap URL for a site.
    Checks: robots.txt Sitemap: directive, /sitemap.xml,
    /sitemap_index.xml, /sitemap-0.xml.
    Returns sitemap URL or None.
    """
    cfg = _get_crawler_config(config)
    headers = {'User-Agent': cfg['user_agent']}
    parsed = urlparse(base_url)
    root = f"{parsed.scheme}://{parsed.netloc}"
    # Check robots.txt first
    try:
        resp = requests.get(
            f"{root}/robots.txt",
            headers=headers,
            timeout=cfg['fetch_timeout']
        )
        if resp.status_code == 200:
            for line in resp.text.splitlines():
                if line.strip().lower().startswith('sitemap:'):
                    sitemap_url = line.split(':', 1)[1].strip()
                    # Handle "Sitemap: https://..." — split(':',1) keeps the URL intact
                    # but "Sitemap: https://..." splits into "Sitemap" and " https://..."
                    # Need to rejoin properly
                    if not sitemap_url.startswith('http'):
                        sitemap_url = line[line.index(':') + 1:].strip()
                    logger.info(f"Found sitemap in robots.txt: {sitemap_url}")
                    return sitemap_url
    except Exception as e:
        logger.debug(f"robots.txt fetch failed: {e}")
    # Try common sitemap locations
    candidates = [
        f"{root}/sitemap.xml",
        f"{root}/sitemap_index.xml",
        f"{root}/sitemap-0.xml",
    ]
    for url in candidates:
        try:
            resp = requests.head(
                url,
                headers=headers,
                timeout=cfg['fetch_timeout'],
                allow_redirects=True
            )
            if resp.status_code == 200:
                logger.info(f"Found sitemap at: {url}")
                return url
        except Exception:
            continue
    logger.warning(f"No sitemap found for {base_url}")
    return None
 def parse_sitemap(sitemap_url, config=None):
    """
    Parse a sitemap XML and return all page URLs.
    Handles standard sitemaps (<urlset>) and sitemap indexes
    (<sitemapindex>) with recursive sub-sitemap fetching.
    """
    cfg = _get_crawler_config(config)
    headers = {'User-Agent': cfg['user_agent']}
    all_urls = []
    def _fetch_and_parse(url, depth=0):
        if depth > 3:
            return
        try:
            resp = requests.get(url, headers=headers, timeout=cfg['fetch_timeout'])
            resp.raise_for_status()
        except Exception as e:
            logger.error(f"Failed to fetch sitemap {url}: {e}")
            return
        try:
            root = etree.fromstring(resp.content)
        except etree.XMLSyntaxError as e:
            logger.error(f"Invalid XML in sitemap {url}: {e}")
            return
        nsmap = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
        # Check if this is a sitemap index
        sitemap_locs = root.findall('.//ns:sitemap/ns:loc', nsmap)
        if sitemap_locs:
            logger.info(f"Sitemap index at {url} — {len(sitemap_locs)} sub-sitemaps")
            for loc in sitemap_locs:
                if loc.text:
                    _fetch_and_parse(loc.text.strip(), depth + 1)
            return
        # Standard sitemap — extract URLs
        url_locs = root.findall('.//ns:loc', nsmap)
        # Fallback: try without namespace
        if not url_locs:
            url_locs = root.findall('.//loc')
        for loc in url_locs:
            if loc.text:
                all_urls.append(loc.text.strip())
        logger.info(f"Parsed {len(url_locs)} URLs from {url}")
    _fetch_and_parse(sitemap_url)
    # Deduplicate preserving order
    seen = set()
    unique = []
    for url in all_urls:
        url_clean = urldefrag(url)[0]
        if url_clean not in seen:
            seen.add(url_clean)
            unique.append(url_clean)
    logger.info(f"Total unique URLs from sitemap: {len(unique)}")
    return unique
 # ─── Link-Following Discovery (Fallback) ───────────────────────────
 def crawl_links(base_url, max_depth=3, max_pages=500, config=None):
    """
    Discover URLs by following internal links (BFS).
    Fallback when no sitemap is available.
    """
    from bs4 import BeautifulSoup
    cfg = _get_crawler_config(config)
    headers = {'User-Agent': cfg['user_agent']}
    parsed_base = urlparse(base_url)
    base_domain = parsed_base.netloc
    discovered = []
    visited = set()
    queue = deque([(base_url, 0)])
    skip_extensions = (
        '.pdf', '.png', '.jpg', '.jpeg', '.gif', '.svg',
        '.css', '.js', '.zip', '.tar', '.gz', '.mp4', '.mp3',
        '.ico', '.woff', '.woff2', '.ttf', '.eot',
    )
    skip_paths = (
        '/tag/', '/tags/', '/page/', '/feed/', '/rss/',
        '/wp-json/', '/wp-admin/', '/wp-includes/',
    )
    while queue and len(discovered) < max_pages:
        url, depth = queue.popleft()
        url = urldefrag(url)[0]
        if url in visited:
            continue
        if depth > max_depth:
            continue
        visited.add(url)
        discovered.append(url)
        if depth >= max_depth:
            continue
        try:
            resp = requests.get(url, headers=headers, timeout=cfg['fetch_timeout'])
            if resp.status_code != 200:
                continue
            if 'text/html' not in resp.headers.get('content-type', ''):
                continue
        except Exception:
            continue
        try:
            soup = BeautifulSoup(resp.text, 'lxml')
        except Exception:
            continue
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            full_url = urljoin(url, href)
            full_url = urldefrag(full_url)[0]
            parsed = urlparse(full_url)
            if parsed.netloc != base_domain:
                continue
            if any(parsed.path.lower().endswith(ext) for ext in skip_extensions):
                continue
            if any(skip in parsed.path.lower() for skip in skip_paths):
                continue
            if full_url not in visited:
                queue.append((full_url, depth + 1))
        time.sleep(cfg['rate_limit_delay'])
    logger.info(f"Link crawl: {len(discovered)} URLs (visited {len(visited)}, depth {max_depth})")
    return discovered
 # ─── URL Filtering ──────────────────────────────────────────────────
 def filter_urls(urls, include=None, exclude=None):
    """
    Filter URLs by path prefix include/exclude rules.
    include: URL must match at least one prefix (if provided)
    exclude: URL must not match any prefix
    """
    filtered = []
    for url in urls:
        path = urlparse(url).path
        if include:
            if not any(path.startswith(prefix) for prefix in include):
                continue
        if exclude:
            if any(path.startswith(prefix) for prefix in exclude):
                continue
        filtered.append(url)
    logger.info(f"Filtered {len(urls)} -> {len(filtered)} URLs "
                f"(include={include}, exclude={exclude})")
    return filtered
 # ─── Main Crawl Orchestrator ────────────────────────────────────────
 def crawl_site(
    base_url,
    category='Web',
    source=None,
    include=None,
    exclude=None,
    max_pages=None,
    max_depth=None,
    delay=None,
    dry_run=False,
    use_sitemap=True,
    use_links=True,
    config=None,
 ):
    """
    Crawl a site and ingest all discovered pages.
    1. Discover URLs via sitemap or link-following
    2. Apply include/exclude filters
    3. Feed each URL through web_scraper.ingest_url()
    Returns summary dict with counts and per-URL results.
    """
    if config is None:
        config = get_config()
    cfg = _get_crawler_config(config)
    if max_pages is None:
        max_pages = cfg['max_pages']
    if max_depth is None:
        max_depth = cfg['max_depth']
    if delay is None:
        delay = cfg['rate_limit_delay']
    if source is None:
        source = urlparse(base_url).netloc
    logger.info(f"Crawling {base_url} (category={category}, max_pages={max_pages})")
    # ── Phase 1: Discover URLs ──
    urls = []
    discovery_method = None
    if use_sitemap:
        sitemap_url = discover_sitemap_url(base_url, config)
        if sitemap_url:
            urls = parse_sitemap(sitemap_url, config)
            discovery_method = 'sitemap'
    if not urls and use_links:
        logger.info("No sitemap URLs, falling back to link crawl...")
        urls = crawl_links(base_url, max_depth=max_depth, max_pages=max_pages, config=config)
        discovery_method = 'link_crawl'
    if not urls:
        logger.warning(f"No URLs discovered for {base_url}")
        return {
            'site': base_url,
            'discovery_method': None,
            'urls_discovered': 0,
            'urls_after_filter': 0,
            'results': [],
            'summary': {'total': 0, 'succeeded': 0, 'duplicates': 0, 'failed': 0},
        }
    # ── Phase 2: Filter URLs ──
    all_exclude = list(cfg['default_exclude'])
    if exclude:
        all_exclude.extend(exclude)
    urls = filter_urls(urls, include=include, exclude=all_exclude)
    if len(urls) > max_pages:
        logger.info(f"Limiting to {max_pages} pages (discovered {len(urls)})")
        urls = urls[:max_pages]
    logger.info(f"After filtering: {len(urls)} URLs to process")
    # ── Dry run ──
    if dry_run:
        return {
            'site': base_url,
            'discovery_method': discovery_method,
            'dry_run': True,
            'urls_discovered': len(urls),
            'urls': urls,
        }
    # ── Phase 3: Ingest each URL ──
    from .web_scraper import ingest_url
    results = []
    total = len(urls)
    for i, url in enumerate(urls, 1):
        logger.info(f"[{i}/{total}] Ingesting: {url}")
        try:
            result = ingest_url(url, category=category, source=source, config=config)
            result['url'] = url
            results.append(result)
            status = result.get('status', 'unknown')
            title = result.get('title', '')
            if status == 'duplicate':
                logger.info(f"  DUPLICATE: {title}")
            else:
                logger.info(f"  OK: {title} ({result.get('page_count', 0)} pages)")
        except Exception as e:
            logger.error(f"  FAILED: {url} -- {e}")
            results.append({
                'url': url,
                'status': 'failed',
                'error': str(e),
            })
        if i < total and delay > 0:
            time.sleep(delay)
    # ── Summary ──
    succeeded = sum(1 for r in results if r.get('status') not in ('failed', 'duplicate'))
    duplicates = sum(1 for r in results if r.get('status') == 'duplicate')
    failed = sum(1 for r in results if r.get('status') == 'failed')
    summary = {
        'total': len(results),
        'succeeded': succeeded,
        'duplicates': duplicates,
        'failed': failed,
    }
    logger.info(f"Crawl complete: {succeeded} new, {duplicates} duplicates, {failed} failed out of {total}")
    return {
        'site': base_url,
        'domain': urlparse(base_url).netloc,
        'category': category,
        'discovery_method': discovery_method,
        'urls_discovered': total,
        'results': results,
        'summary': summary,
    }
--- a/lib/dispatcher.py
+++ b/lib/dispatcher.py
@ -0,0 +1,237 @@
 """
 RECON Dispatcher
 Watches configured acquired/<subfolder>/ directories for content+sidecar pairs
 that have been stable (mtime unchanged) for the configured threshold, then
 hands them to the appropriate processor's pre_flight().
 Phase 3: importable one-shot dispatcher. Service-loop integration in Phase 5.
 Phase 4: sidecar is optional (PDFs may arrive without .meta.json).
 Phase 6f-2: format normalizer converts non-standard formats to PDF before dispatch.
 """
 import importlib
 import logging
 import os
 import subprocess
 import time
 from .utils import get_config
 from .status import StatusDB
 logger = logging.getLogger("recon.dispatcher")
 # Content file extensions recognized by the dispatcher
 CONTENT_EXTENSIONS = {'.txt', '.vtt', '.html', '.pdf'}
 # Non-standard formats that can be converted to PDF before dispatch
 CONVERTIBLE_EXTENSIONS = {'.epub', '.mobi', '.doc', '.docx'}
 def _load_processor(processor_name):
    """Dynamically import a processor module from lib.processors."""
    module_path = f"lib.processors.{processor_name}"
    try:
        return importlib.import_module(module_path)
    except ModuleNotFoundError:
        logger.debug("Processor module not found: %s (not yet implemented)", processor_name)
        return None
    except ImportError as e:
        logger.error("Failed to import processor %s: %s", processor_name, e)
        return None
 def _normalize_formats(subfolder_path):
    """Convert non-standard document formats to PDF before dispatch.
    Walks the subfolder for files with convertible extensions (.epub, .mobi,
    .doc, .docx). Converts each to PDF using the appropriate tool, then
    deletes the original.
    Returns count of files converted.
    """
    if not os.path.isdir(subfolder_path):
        return 0
    converted = 0
    for fname in sorted(os.listdir(subfolder_path)):
        stem, ext = os.path.splitext(fname)
        if ext.lower() not in CONVERTIBLE_EXTENSIONS:
            continue
        source = os.path.join(subfolder_path, fname)
        target = os.path.join(subfolder_path, stem + '.pdf')
        if os.path.exists(target):
            logger.debug("Target PDF already exists, skipping: %s", fname)
            continue
        try:
            if ext.lower() in ('.epub', '.mobi'):
                subprocess.run(
                    ['ebook-convert', source, target],
                    capture_output=True, check=True, timeout=300,
                )
            elif ext.lower() in ('.doc', '.docx'):
                subprocess.run(
                    ['libreoffice', '--headless', '--convert-to', 'pdf',
                     '--outdir', subfolder_path, source],
                    capture_output=True, check=True, timeout=300,
                )
            if os.path.isfile(target) and os.path.getsize(target) > 0:
                os.remove(source)
                converted += 1
                logger.info("Converted %s -> %s.pdf", fname, stem)
            else:
                logger.warning("Conversion produced no output: %s", fname)
        except subprocess.TimeoutExpired:
            logger.error("Conversion timed out: %s", fname)
        except subprocess.CalledProcessError as e:
            logger.error("Conversion failed for %s: %s", fname,
                         e.stderr.decode(errors='replace')[:200] if e.stderr else str(e))
        except Exception as e:
            logger.error("Unexpected error converting %s: %s", fname, e)
    return converted
 def _find_pairs(subfolder_path):
    """Find content files (with optional sidecar) in a subfolder.
    A pair is:
      <basename>.<ext>       — content file
      <basename>.meta.json   — optional sidecar
    Returns list of (content_path, meta_path_or_None, basename) tuples.
    """
    if not os.path.isdir(subfolder_path):
        return []
    files = set(os.listdir(subfolder_path))
    pairs = []
    seen_basenames = set()
    # First pass: find .meta.json files and their matching content
    for fname in sorted(files):
        if fname.endswith('.meta.json'):
            basename = fname[:-len('.meta.json')]
            for ext in sorted(CONTENT_EXTENSIONS):
                content_name = basename + ext
                if content_name in files:
                    pairs.append((
                        os.path.join(subfolder_path, content_name),
                        os.path.join(subfolder_path, fname),
                        basename,
                    ))
                    seen_basenames.add(content_name)
                    break
    # Second pass: find solo content files (no sidecar)
    for fname in sorted(files):
        if fname in seen_basenames:
            continue
        _stem, ext = os.path.splitext(fname)
        if ext.lower() in CONTENT_EXTENSIONS and not fname.endswith('.meta.json'):
            pairs.append((
                os.path.join(subfolder_path, fname),
                None,
                _stem,
            ))
    return pairs
 def _is_stable(filepath, stability_seconds):
    """Check if a file's mtime is older than stability_seconds ago."""
    try:
        mtime = os.path.getmtime(filepath)
        return (time.time() - mtime) >= stability_seconds
    except OSError:
        return False
 def dispatch_once():
    """One-shot dispatch: scan all configured acquired/ subfolders once.
    Returns list of result dicts from processor pre_flight calls.
    """
    config = get_config()
    pipeline_cfg = config.get('pipeline', {})
    acquired_root = pipeline_cfg.get('acquired_root', '/opt/recon/data/acquired')
    dispatch_map = pipeline_cfg.get('dispatch', {})
    stability_seconds = pipeline_cfg.get('mtime_stability_seconds', 10)
    db = StatusDB(config['paths']['db'])
    results = []
    for subfolder_name, processor_name in dispatch_map.items():
        subfolder_path = os.path.join(acquired_root, subfolder_name)
        processor = _load_processor(processor_name)
        if processor is None:
            continue
        if not hasattr(processor, 'pre_flight'):
            logger.error("Processor %s has no pre_flight function", processor_name)
            continue
        # Convert non-standard formats to PDF before scanning for pairs
        _normalize_formats(subfolder_path)
        pairs = _find_pairs(subfolder_path)
        if not pairs:
            continue
        for content_path, meta_path, basename in pairs:
            # Content file must be stable; sidecar too if present
            if not _is_stable(content_path, stability_seconds):
                logger.debug("File %s not yet stable, skipping", basename)
                continue
            if meta_path and not _is_stable(meta_path, stability_seconds):
                logger.debug("Sidecar for %s not yet stable, skipping", basename)
                continue
            logger.info("Dispatching %s/%s to %s", subfolder_name, basename, processor_name)
            try:
                result = processor.pre_flight(content_path, meta_path, db, config)
                results.append(result)
                logger.info("Result for %s: %s", basename, result.get('action', 'unknown'))
            except Exception as e:
                logger.error("Processor %s crashed on %s: %s", processor_name, basename, e)
                results.append({
                    'action': 'error',
                    'error': str(e),
                    'basename': basename,
                })
    return results
 def dispatch_loop(stop_event, db, config, interval=30):
    """Run dispatch_once() on a loop until stop_event is set.
    Designed to run as a service thread. Never raises to the caller.
    """
    logger.info("[dispatcher] Loop started (interval: %ds)", interval)
    while not stop_event.is_set():
        try:
            results = dispatch_once()
            if results:
                actions = {}
                for r in results:
                    a = r.get('action', 'unknown')
                    actions[a] = actions.get(a, 0) + 1
                logger.info("[dispatcher] Dispatched %d items: %s",
                            len(results),
                            ", ".join(f"{k}={v}" for k, v in sorted(actions.items())))
            else:
                logger.debug("[dispatcher] No items to dispatch")
        except Exception as e:
            logger.error("[dispatcher] Error in dispatch_once: %s", e, exc_info=True)
        stop_event.wait(interval)
    logger.info("[dispatcher] Loop stopped")
--- a/lib/embedder.py
+++ b/lib/embedder.py
@ -21,6 +21,7 @@ from qdrant_client.models import PointStruct, SparseVector
 from .utils import get_config, concept_id, generate_download_url, setup_logging
 from .status import StatusDB
 from .utils import resolve_text_dir
 logger = setup_logging('recon.embedder')
@ -274,7 +275,7 @@ def embed_single(file_hash, db, config):
        source_type = 'web' if is_web else 'document'
        # Check meta.json for explicit source_type (e.g. 'transcript')
-        text_dir = os.path.join(config['paths']['text'], file_hash)
+        text_dir = resolve_text_dir(file_hash, config, db)
        meta_path = os.path.join(text_dir, 'meta.json')
        page_timestamps = {}
        if os.path.exists(meta_path):
--- a/lib/enricher.py
+++ b/lib/enricher.py
@ -25,6 +25,7 @@ import google.generativeai as genai
 from .utils import get_config, setup_logging
 from .status import StatusDB
 from .utils import resolve_text_dir
 logger = setup_logging('recon.enricher')
@ -345,7 +346,7 @@ def enrich_single(file_hash, db, config, key_rotator):
    if not doc:
        return False
-    text_dir = os.path.join(config['paths']['text'], file_hash)
+    text_dir = resolve_text_dir(file_hash, config, db)
    concepts_dir = os.path.join(config['paths']['concepts'], file_hash)
    window_size = config['processing']['enrich_window_size']
    delay = config['processing']['rate_limit_delay']
--- a/lib/filing.py
+++ b/lib/filing.py
@ -0,0 +1,215 @@
 """
 RECON shared filing logic.
 Provides file_processed_item() — the shared function that any processor
 can call to file a completed item from /opt/recon/data/processing/{hash}/
 into /mnt/library/Domain/Subdomain/{canonical_name}.{ext}.
 The function:
  1. Reads dominant domain from concept JSONs (existing logic)
  2. Derives canonical name (level 1, escalating to 2/3/4 only on collision)
  3. Moves the source file from processing/ to library/Domain/Subdomain/
  4. Updates catalogue + documents + Qdrant payloads atomically
  5. Marks organized
 This function does NOT extract, enrich, or embed. Those are upstream stages.
 This function does NOT touch the legacy organize_document() — that stays in place
 until cutover (Phase 5).
 Phase 2: function exists, is tested in isolation. Not yet called by anything
 in the service loop.
 """
 import logging
 import os
 import shutil
 from .organizer import determine_dominant_domain, _build_target_path
 from .new_pipeline import update_qdrant_payload
 logger = logging.getLogger("recon.filing")
 def file_processed_item(doc_hash, source_file_path, db, config, dry_run=False):
    """File a completed item into the library.
    Args:
        doc_hash: Document hash
        source_file_path: Current absolute path to the source file
            (typically in /opt/recon/data/processing/{hash}/ or current library path)
        db: StatusDB instance
        config: RECON config dict
        dry_run: If True, plan but don't move
    Returns:
        dict with keys:
            hash, action, source_path, target_path,
            domain, subdomain, qdrant_points_updated, error
    """
    result = {
        "hash": doc_hash,
        "action": "skip",
        "source_path": source_file_path,
        "target_path": None,
        "domain": None,
        "subdomain": None,
        "qdrant_points_updated": 0,
        "error": None,
    }
    # Verify source file exists
    if not os.path.exists(source_file_path):
        result["action"] = "error"
        result["error"] = f"Source file not found: {source_file_path}"
        return result
    # Determine domain from existing concept JSONs
    data_dir = config["paths"]["data"]
    domain, subdomain, confidence = determine_dominant_domain(doc_hash, data_dir)
    result["domain"] = domain
    result["subdomain"] = subdomain
    if domain is None:
        result["action"] = "skip_unclassified"
        return result
    # Get the original filename from catalogue
    conn = db._get_conn()
    row = conn.execute(
        "SELECT filename FROM catalogue WHERE hash = ?", (doc_hash,)
    ).fetchone()
    if not row:
        result["action"] = "error"
        result["error"] = f"Hash not in catalogue: {doc_hash}"
        return result
    original_filename = row["filename"]
    # Build target path using existing collision-handling logic
    library_root = config["library_root"]
    target_path, sanitized_name = _build_target_path(
        library_root, domain, subdomain, original_filename, doc_hash
    )
    if target_path is None:
        result["action"] = "skip_unclassified"
        return result
    # Fix 1.1: Preserve the source file's actual extension instead of
    # the default .pdf that sanitize_filename() may have applied
    source_ext = os.path.splitext(source_file_path)[1].lower()
    if source_ext:
        target_stem, _old_ext = os.path.splitext(target_path)
        target_path = target_stem + source_ext
        san_stem, _old_ext = os.path.splitext(sanitized_name)
        sanitized_name = san_stem + source_ext
    result["target_path"] = target_path
    # If already at target (idempotency), just mark organized
    if os.path.abspath(source_file_path) == os.path.abspath(target_path):
        result["action"] = "skip_already_filed"
        if not dry_run:
            db.mark_organized(doc_hash)
        return result
    if dry_run:
        result["action"] = "would_file"
        return result
    # Move the file
    try:
        target_dir = os.path.dirname(target_path)
        os.makedirs(target_dir, exist_ok=True)
        shutil.move(source_file_path, target_path)
    except Exception as e:
        result["action"] = "error"
        result["error"] = f"Move failed: {e}"
        logger.error("Move failed for %s: %s", doc_hash[:8], e)
        return result
    # Update DB and Qdrant
    try:
        db.update_catalogue_path(doc_hash, target_path, sanitized_name)
        db.sync_document_path(doc_hash, target_path, sanitized_name)
        db.mark_organized(doc_hash)
        # Update Qdrant payloads (download_url, filename, original_filename)
        points = update_qdrant_payload(
            doc_hash, target_path, sanitized_name, original_filename, config
        )
        result["qdrant_points_updated"] = points
        result["action"] = "filed"
        logger.info(
            "Filed %s -> %s [%s/%s, %d vectors]",
            doc_hash[:8],
            target_path,
            domain,
            subdomain,
            points,
        )
    except Exception as e:
        # File was moved but DB update failed — log the dangerous state
        result["action"] = "error"
        result["error"] = f"DB/Qdrant update failed after move: {e}"
        logger.error("DB/Qdrant update failed for %s: %s", doc_hash[:8], e)
    return result
 def filing_worker_loop(stop_event, db, config, interval=30):
    """Run filing on items ready to be filed until stop_event is set.
    Watches for documents with status='complete', organized_at IS NULL,
    and path in /opt/recon/data/processing/. Files them to library.
    Designed to run as a service thread. Never raises to the caller.
    """
    logger.info("[filing] Worker started (interval: %ds)", interval)
    while not stop_event.is_set():
        try:
            conn = db._get_conn()
            rows = conn.execute(
                "SELECT hash, path FROM documents "
                "WHERE status = 'complete' "
                "AND organized_at IS NULL "
                "AND path LIKE '/opt/recon/data/processing/%' "
                "LIMIT 50"
            ).fetchall()
            if rows:
                filed = 0
                skipped = 0
                errors = 0
                for row in rows:
                    if stop_event.is_set():
                        break
                    try:
                        result = file_processed_item(row['hash'], row['path'], db, config)
                        action = result.get('action', 'unknown')
                        if action == 'filed':
                            filed += 1
                        elif action.startswith('skip'):
                            skipped += 1
                        elif action == 'error':
                            errors += 1
                            logger.warning("[filing] Error filing %s: %s",
                                           row['hash'][:8], result.get('error', 'unknown'))
                    except Exception as e:
                        errors += 1
                        logger.error("[filing] Exception filing %s: %s",
                                     row['hash'][:8], e, exc_info=True)
                logger.info("[filing] Batch: %d filed, %d skipped, %d errors",
                            filed, skipped, errors)
            else:
                logger.debug("[filing] No items ready to file")
        except Exception as e:
            logger.error("[filing] Error in filing worker: %s", e, exc_info=True)
        stop_event.wait(interval)
    logger.info("[filing] Worker stopped")
--- a/lib/processors/init.py
+++ b/lib/processors/init.py
--- a/lib/processors/pdf_processor.py
+++ b/lib/processors/pdf_processor.py
@ -0,0 +1,665 @@
 """
 RECON PDF Processor
 Handles pre_flight for PDF content arriving in acquired/pdf/.
 Opens the PDF, extracts layered metadata (PDF dict + filename + Gemini),
 votes on fields, checks level-4 dedupe, runs full text extraction,
 and registers in the database.
 Metadata sources:
  A. PDF info dictionary (PyPDF2 reader.metadata)
  B. Filename parsing (clean_filename_to_title + regex patterns)
  C. Gemini LLM on first 3 pages of extracted text
 Voting: if 2+ sources agree on a field, use that value.
 Otherwise priority: C (Gemini) > A (PDF dict) > B (filename).
 Level-4 dedupe: requires ALL FOUR fields (title, author, edition, year)
 present and matching an existing document. Very conservative — only
 catches near-certain duplicates.
 Failure modes:
  - Transient (Gemini API): retry 3x with backoff, then continue without
  - Content (unreadable PDF): move to _review/rejected_pdfs/
  - Level-4 duplicate: move to _review/duplicate_quarantine/
 Phase 4: first implementation.
 """
 import json
 import logging
 import os
 import re
 import shutil
 import subprocess
 import time
 import google.generativeai as genai
 from PyPDF2 import PdfReader
 from lib.extractor import extract_text_from_page
 from lib.utils import content_hash, clean_filename_to_title, sanitize_filename
 logger = logging.getLogger("recon.processors.pdf")
 # Maximum retries for transient (API) failures
 MAX_TRANSIENT_RETRIES = 3
 TRANSIENT_BACKOFF_SECONDS = 30
 # ── Metadata Extraction Sources ─────────────────────────────────────
 def _extract_pdf_dict(reader):
    """Source A: Extract metadata from PDF's built-in info dictionary.
    Returns dict with keys: title, author, edition, year (any may be None).
    """
    result = {'title': None, 'author': None, 'edition': None, 'year': None}
    try:
        info = reader.metadata
        if not info:
            return result
    except Exception:
        return result
    # Title
    title = None
    try:
        title = info.get('/Title') or info.title
    except Exception:
        pass
    if title and isinstance(title, str) and len(title.strip()) > 2:
        result['title'] = title.strip()
    # Author
    author = None
    try:
        author = info.get('/Author') or info.author
    except Exception:
        pass
    if author and isinstance(author, str) and len(author.strip()) > 1:
        result['author'] = author.strip()
    # Year from CreationDate or ModDate
    for date_key in ('/CreationDate', '/ModDate'):
        try:
            date_val = info.get(date_key)
        except Exception:
            continue
        if date_val and isinstance(date_val, str):
            match = re.search(r'(?:D:)?(\d{4})', date_val)
            if match:
                year = int(match.group(1))
                if 1800 <= year <= 2030:
                    result['year'] = str(year)
                    break
    # Edition from Subject or Title
    for field_val in [info.get('/Subject'), title]:
        if field_val and isinstance(field_val, str):
            ed_match = re.search(
                r'(\d+)(?:st|nd|rd|th)?\s*(?:edition|ed\.?)',
                field_val, re.IGNORECASE
            )
            if ed_match:
                result['edition'] = ed_match.group(0).strip()
                break
    return result
 def _extract_filename_metadata(filename):
    """Source B: Extract metadata by parsing the filename.
    Returns dict with keys: title, author, edition, year (any may be None).
    """
    result = {'title': None, 'author': None, 'edition': None, 'year': None}
    stem = os.path.splitext(filename)[0]
    # Title from filename (using existing utility)
    result['title'] = clean_filename_to_title(filename)
    # Year: look for (YYYY) or [YYYY] or _YYYY_ or standalone YYYY near boundaries
    year_match = re.search(r'[\(\[_\s]((?:19|20)\d{2})[\)\]_\s.]', stem)
    if year_match:
        result['year'] = year_match.group(1)
    # Edition: look for Nth Edition, Edition N, etc.
    ed_match = re.search(
        r'(\d+)(?:st|nd|rd|th)?\s*(?:edition|ed\.?)',
        stem, re.IGNORECASE
    )
    if ed_match:
        result['edition'] = ed_match.group(0).strip()
    # Author: look for "by Author Name" pattern
    by_match = re.search(r'\bby\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', stem)
    if by_match:
        result['author'] = by_match.group(1).strip()
    return result
 def _extract_gemini_metadata(pages_text, config):
    """Source C: Extract metadata using Gemini on first 3 pages text.
    Returns dict with keys: title, author, edition, year (any may be None).
    Retries up to MAX_TRANSIENT_RETRIES times on transient failures.
    """
    result = {'title': None, 'author': None, 'edition': None, 'year': None}
    keys = config.get('gemini_keys', [])
    if not keys or len(pages_text.strip()) < 50:
        return result
    prompt = (
        "Extract the following metadata from this book/document text.\n"
        'Return JSON: {"title": "...", "author": "...", "edition": "...", "year": "..."}\n'
        "- title: The full title of the book or document\n"
        "- author: The author(s) name(s)\n"
        '- edition: The edition (e.g. "2nd Edition", "Revised Edition")\n'
        "- year: The publication year (4-digit number as string)\n"
        "If a field cannot be determined, use null.\n\n"
        "Text from first pages:\n"
        + pages_text[:6000]
    )
    for attempt in range(MAX_TRANSIENT_RETRIES):
        try:
            key = keys[attempt % len(keys)]
            genai.configure(api_key=key)
            model = genai.GenerativeModel(
                config['gemini']['model'],
                generation_config={
                    "response_mime_type": config['gemini']['response_mime_type']
                }
            )
            response = model.generate_content(prompt)
            data = json.loads(response.text)
            for field in ('title', 'author', 'edition', 'year'):
                val = data.get(field)
                if val and isinstance(val, str) and val.strip() and val.strip().lower() != "null":
                    result[field] = val.strip()
            return result
        except Exception as e:
            logger.warning(
                "Gemini metadata attempt %d/%d failed: %s",
                attempt + 1, MAX_TRANSIENT_RETRIES, e
            )
            if attempt < MAX_TRANSIENT_RETRIES - 1:
                time.sleep(TRANSIENT_BACKOFF_SECONDS)
    logger.warning(
        "Gemini metadata extraction failed after %d attempts", MAX_TRANSIENT_RETRIES
    )
    return result
 # ── Voting ───────────────────────────────────────────────────────────
 def _vote_metadata(source_a, source_b, source_c):
    """Vote on each metadata field across three sources.
    Priority: C (Gemini) > A (PDF dict) > B (filename).
    If 2+ sources agree, use the agreed value.
    Returns:
        (voted_dict, provenance_dict)
        voted_dict:  {field: value_or_None}
        provenance_dict: {field: source_label_or_None}
    """
    sources = {'pdf_dict': source_a, 'filename': source_b, 'gemini': source_c}
    priority = ['gemini', 'pdf_dict', 'filename']
    voted = {}
    provenance = {}
    for field in ('title', 'author', 'edition', 'year'):
        values = {}
        for name, src in sources.items():
            val = src.get(field)
            if val and str(val).strip().lower() != "null":
                values[name] = val
        if not values:
            voted[field] = None
            provenance[field] = None
            continue
        # Normalize for comparison
        def _norm(v):
            if field == 'year':
                return v.strip()
            return v.strip().lower()
        # Group by normalized value
        norm_groups = {}
        for name, val in values.items():
            nv = _norm(val)
            norm_groups.setdefault(nv, []).append((name, val))
        # Find group with most agreement
        best_group = max(norm_groups.values(), key=len)
        if len(best_group) >= 2:
            # 2+ sources agree — use highest-priority original case
            for p in priority:
                for name, val in best_group:
                    if name == p:
                        voted[field] = val
                        names = ','.join(n for n, _ in best_group)
                        provenance[field] = "agreed({})".format(names)
                        break
                if voted.get(field) is not None:
                    break
        else:
            # No agreement — highest priority wins
            for p in priority:
                if p in values:
                    voted[field] = values[p]
                    provenance[field] = p
                    break
    return voted, provenance
 # ── Level-4 Dedupe ───────────────────────────────────────────────────
 def _check_level4_dedupe(voted, db):
    """Level-4 strict dedupe: all four fields must be present and match.
    Returns (is_duplicate, matching_hash) or (False, None).
    """
    title = voted.get('title')
    author = voted.get('author')
    edition = voted.get('edition')
    year = voted.get('year')
    if not all([title, author, edition, year]):
        return False, None
    conn = db._get_conn()
    rows = conn.execute(
        "SELECT hash, metadata_provenance FROM documents "
        "WHERE book_title = ? AND book_author = ?",
        (title, author)
    ).fetchall()
    if not rows:
        return False, None
    for row in rows:
        prov_json = row['metadata_provenance']
        if not prov_json:
            continue
        try:
            prov_data = json.loads(prov_json)
            existing_voted = prov_data.get('voted', {})
            ex_edition = existing_voted.get('edition', '')
            ex_year = existing_voted.get('year', '')
            if (ex_edition and ex_year
                    and ex_edition.lower() == edition.lower()
                    and ex_year == year):
                return True, row['hash']
        except (json.JSONDecodeError, AttributeError):
            continue
    return False, None
 # ── Page Count Fallback ──────────────────────────────────────────────
 def _pdfinfo_page_count(pdf_path):
    """Get page count via pdfinfo (poppler) when PdfReader fails."""
    try:
        proc = subprocess.run(
            ['pdfinfo', pdf_path],
            capture_output=True, text=True, timeout=30
        )
        if proc.returncode == 0:
            for line in proc.stdout.splitlines():
                if line.startswith('Pages:'):
                    return int(line.split(':', 1)[1].strip())
    except Exception:
        pass
    return 0
 # ── Pre-Flight ───────────────────────────────────────────────────────
 def pre_flight(content_path, meta_path, db, config):
    """Process a PDF from acquired/pdf/.
    Args:
        content_path: Path to the PDF file
        meta_path: Path to .meta.json sidecar (may be None)
        db: StatusDB instance
        config: RECON config dict
    Returns:
        dict with keys: hash, action, source_path, error
        Actions: 'extracted', 'duplicate', 'level4_duplicate',
                 'content_failure', 'error'
    """
    result = {
        'hash': None,
        'action': 'error',
        'source_path': content_path,
        'error': None,
    }
    filename = os.path.basename(content_path)
    # ── Step 1: Hash ──────────────────────────────────────────────
    try:
        file_hash = content_hash(content_path)
        result['hash'] = file_hash
    except Exception as e:
        result['error'] = "Cannot hash PDF: {}".format(e)
        return result
    # ── Step 2: Stale state cleanup ───────────────────────────────
    processing_root = config.get('pipeline', {}).get(
        'processing_root', '/opt/recon/data/processing'
    )
    proc_dir = os.path.join(processing_root, file_hash)
    concepts_dir = os.path.join(config['paths']['concepts'], file_hash)
    if os.path.exists(proc_dir):
        try:
            shutil.rmtree(proc_dir)
        except Exception as e:
            logger.error("Stale cleanup failed for %s: %s", proc_dir, e)
            raise
    if os.path.exists(concepts_dir):
        try:
            shutil.rmtree(concepts_dir)
        except Exception as e:
            logger.error("Stale cleanup failed for %s: %s", concepts_dir, e)
            raise
    # ── Step 3: Hash dedupe ───────────────────────────────────────
    conn = db._get_conn()
    existing = conn.execute(
        "SELECT hash FROM catalogue WHERE hash = ?", (file_hash,)
    ).fetchone()
    if existing:
        logger.info("Duplicate hash %s, removing pair", file_hash[:8])
        try:
            os.remove(content_path)
            if meta_path:
                os.remove(meta_path)
        except OSError as e:
            logger.warning("Failed to remove duplicate pair: %s", e)
        result['action'] = 'duplicate'
        return result
    # ── Step 4: Size check ────────────────────────────────────────
    proc_cfg = config.get('processing', {})
    max_size_mb = proc_cfg.get('max_pdf_size_mb', 200)
    try:
        file_size_mb = os.path.getsize(content_path) / 1048576
    except OSError as e:
        result['error'] = "Cannot stat PDF: {}".format(e)
        return result
    if file_size_mb > max_size_mb:
        result['error'] = "PDF too large: {:.0f}MB > {}MB limit".format(
            file_size_mb, max_size_mb
        )
        result['action'] = 'content_failure'
        _move_to_rejected(content_path, meta_path, config, filename)
        return result
    # ── Step 5: Open PDF ──────────────────────────────────────────
    reader = None
    page_count = 0
    try:
        reader = PdfReader(content_path)
        page_count = len(reader.pages)
    except Exception as e:
        logger.warning(
            "PdfReader failed for %s: %s — trying pdfinfo", filename, e
        )
        page_count = _pdfinfo_page_count(content_path)
    if page_count == 0:
        logger.error("Content failure: cannot read PDF %s", filename)
        result['action'] = 'content_failure'
        result['error'] = "Cannot read PDF (0 pages)"
        _move_to_rejected(content_path, meta_path, config, filename)
        return result
    # ── Step 6: Source A — PDF dict metadata ──────────────────────
    source_a = _extract_pdf_dict(reader) if reader else {
        'title': None, 'author': None, 'edition': None, 'year': None
    }
    # ── Step 7: Source B — Filename metadata ──────────────────────
    source_b = _extract_filename_metadata(filename)
    # ── Step 8: Extract first 3 pages for Source C ────────────────
    page_timeout = proc_cfg.get('page_timeout', 30)
    first_pages_text = ""
    if reader:
        for i in range(min(3, page_count)):
            try:
                text, _method = extract_text_from_page(
                    reader, i, content_path, page_timeout
                )
                first_pages_text += text + "\n"
            except Exception as e:
                logger.warning(
                    "Failed to extract page %d for metadata: %s", i + 1, e
                )
    # ── Step 9: Source C — Gemini metadata ────────────────────────
    source_c = _extract_gemini_metadata(first_pages_text, config)
    # ── Step 10: Vote ─────────────────────────────────────────────
    voted, provenance = _vote_metadata(source_a, source_b, source_c)
    provenance_record = {
        'voted': voted,
        'provenance': provenance,
        'sources': {
            'pdf_dict': source_a,
            'filename': source_b,
            'gemini': source_c,
        }
    }
    # ── Step 11: Level-4 dedupe ───────────────────────────────────
    is_dup, dup_hash = _check_level4_dedupe(voted, db)
    if is_dup:
        logger.info(
            "Level-4 duplicate: %s matches %s", file_hash[:8], dup_hash[:8]
        )
        quarantine_dir = os.path.join(
            config.get('library_root', '/mnt/library'),
            '_review', 'duplicate_quarantine'
        )
        try:
            os.makedirs(quarantine_dir, exist_ok=True)
            quarantine_path = os.path.join(quarantine_dir, filename)
            shutil.move(content_path, quarantine_path)
            if meta_path:
                os.remove(meta_path)
            san_name = sanitize_filename(filename, doc_hash=file_hash)
            db.queue_duplicate_review(
                doc_hash=file_hash,
                original_filename=filename,
                sanitized_filename=san_name,
                collision_with_hash=dup_hash,
                duplicate_path=quarantine_path,
                book_title=voted.get('title'),
                book_author=voted.get('author'),
            )
        except Exception as e:
            logger.error("Failed to quarantine duplicate: %s", e)
        result['action'] = 'level4_duplicate'
        return result
    # ── Step 12: Set up processing directory ──────────────────────
    try:
        os.makedirs(proc_dir, exist_ok=True)
    except Exception as e:
        result['error'] = "Cannot create processing dir: {}".format(e)
        return result
    pdf_proc_path = os.path.join(proc_dir, 'source.pdf')
    try:
        shutil.move(content_path, pdf_proc_path)
        if meta_path:
            shutil.move(meta_path, os.path.join(proc_dir, 'sidecar.meta.json'))
    except Exception as e:
        result['error'] = "Cannot move PDF to processing: {}".format(e)
        return result
    # ── Step 13: Full text extraction ─────────────────────────────
    # Re-open reader from new location
    try:
        reader = PdfReader(pdf_proc_path)
    except Exception:
        reader = None
    pages_extracted = 0
    ocr_pages = []
    ocr_methods = {
        'pypdf2': 0, 'pdftotext': 0, 'tesseract': 0,
        'gemini_vision': 0, 'none': 0,
    }
    for i in range(page_count):
        try:
            if reader:
                text, method = extract_text_from_page(
                    reader, i, pdf_proc_path, page_timeout
                )
            else:
                proc_result = subprocess.run(
                    ['pdftotext', '-f', str(i + 1), '-l', str(i + 1),
                     pdf_proc_path, '-'],
                    capture_output=True, text=True, timeout=page_timeout
                )
                text = proc_result.stdout if proc_result.returncode == 0 else ''
                method = 'pdftotext' if text.strip() else 'none'
            ocr_methods[method] += 1
            if method in ('tesseract', 'gemini_vision'):
                ocr_pages.append(i + 1)
        except Exception as e:
            logger.warning("Page %d/%d failed: %s", i + 1, page_count, e)
            text = ''
            ocr_methods['none'] += 1
        page_path = os.path.join(proc_dir, "page_{:04d}.txt".format(i + 1))
        with open(page_path, 'w', encoding='utf-8') as f:
            f.write(text)
        if text.strip():
            pages_extracted += 1
        if (i + 1) % 50 == 0:
            logger.info("  %s: page %d/%d", filename, i + 1, page_count)
    # ── Step 14: Write meta.json ──────────────────────────────────
    meta = {
        'hash': file_hash,
        'filename': filename,
        'source_type': 'pdf',
        'page_count': page_count,
        'pages_extracted': pages_extracted,
        'ocr_pages': ocr_pages,
        'ocr_methods': ocr_methods,
        'metadata': voted,
        'metadata_provenance': provenance_record,
    }
    # Merge sidecar meta if present
    sidecar_path = os.path.join(proc_dir, 'sidecar.meta.json')
    if os.path.exists(sidecar_path):
        try:
            with open(sidecar_path, encoding='utf-8') as f:
                sidecar = json.load(f)
            meta['sidecar'] = sidecar
        except Exception:
            pass
    with open(os.path.join(proc_dir, 'meta.json'), 'w', encoding='utf-8') as f:
        json.dump(meta, f, indent=2)
    # ── Step 15: Register in catalogue + documents ────────────────
    display_title = voted.get('title') or clean_filename_to_title(filename)
    size_bytes = os.path.getsize(pdf_proc_path)
    source = 'acquired/pdf'
    category = 'PDF'
    if meta.get('sidecar'):
        source = meta['sidecar'].get('source', source)
        category = meta['sidecar'].get('category', category)
    db.add_to_catalogue(
        file_hash, filename, pdf_proc_path, size_bytes, source, category
    )
    db.queue_document(file_hash)
    # ── Step 16: Update documents row ─────────────────────────────
    conn = db._get_conn()
    conn.execute(
        "UPDATE documents SET "
        "text_dir = ?, page_count = ?, book_title = ?, book_author = ?, "
        "metadata_provenance = ? "
        "WHERE hash = ?",
        (
            proc_dir,
            page_count,
            voted.get('title'),
            voted.get('author'),
            json.dumps(provenance_record),
            file_hash,
        )
    )
    conn.commit()
    # ── Step 17: Status = extracted ───────────────────────────────
    db.update_status(file_hash, 'extracted', pages_extracted=pages_extracted)
    logger.info(
        "PDF pre_flight complete: %s (%s) -> %d/%d pages in %s",
        file_hash[:8], display_title, pages_extracted, page_count, proc_dir,
    )
    result['action'] = 'extracted'
    return result
 # ── Helpers ──────────────────────────────────────────────────────────
 def _move_to_rejected(content_path, meta_path, config, filename):
    """Move an unreadable PDF to _review/rejected_pdfs/."""
    review_dir = os.path.join(
        config.get('library_root', '/mnt/library'),
        '_review', 'rejected_pdfs'
    )
    try:
        os.makedirs(review_dir, exist_ok=True)
        reject_path = os.path.join(review_dir, filename)
        shutil.move(content_path, reject_path)
        if meta_path:
            os.remove(meta_path)
        logger.info("Moved rejected PDF to %s", reject_path)
    except Exception as e:
        logger.error("Failed to move rejected PDF: %s", e)
--- a/lib/processors/text_processor.py
+++ b/lib/processors/text_processor.py
@ -0,0 +1,320 @@
 """
 RECON Text Processor
 Handles pre_flight for plain .txt files arriving in acquired/text/.
 These are primary source documents (books, manuals, guides), not derived
 content like transcripts.
 Metadata extraction via two-source vote:
  A. Filename parsing (title, optionally author)
  B. Gemini LLM extraction (title/author/edition/year from first 3 pages)
 Filing behavior matches PDFs: files get organized to library/Domain/Subdomain/
 by the filing worker (NOT organized in-place like transcripts).
 Phase 6f: initial implementation.
 """
 import json
 import logging
 import os
 import re
 import shutil
 from lib.web_scraper import chunk_text
 from lib.utils import content_hash, clean_filename_to_title
 from lib.processors.pdf_processor import _extract_gemini_metadata
 logger = logging.getLogger("recon.processors.text")
 WORDS_PER_PAGE = 2000
 def _extract_filename_metadata(filename):
    """Source A: Extract metadata by parsing the filename.
    Returns dict with keys: title, author, edition, year (any may be None).
    """
    result = {'title': None, 'author': None, 'edition': None, 'year': None}
    stem = os.path.splitext(filename)[0]
    result['title'] = clean_filename_to_title(filename)
    # Year: look for (YYYY) or [YYYY] or _YYYY_ or standalone YYYY
    year_match = re.search(r'[\(\[_\s]((?:19|20)\d{2})[\)\]_\s.]', stem)
    if year_match:
        result['year'] = year_match.group(1)
    # Edition: Nth Edition, Edition N, etc.
    ed_match = re.search(
        r'(\d+)(?:st|nd|rd|th)?\s*(?:edition|ed\.?)',
        stem, re.IGNORECASE
    )
    if ed_match:
        result['edition'] = ed_match.group(0).strip()
    # Author: "Title - Author" or "Title by Author" patterns
    # Try " - Author" at end
    dash_match = re.search(r'\s+-\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)$', stem)
    if dash_match:
        result['author'] = dash_match.group(1).strip()
    else:
        by_match = re.search(r'\bby\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', stem)
        if by_match:
            result['author'] = by_match.group(1).strip()
    return result
 def _vote_metadata(source_a, source_b):
    """Vote on each metadata field across two sources.
    Priority: B (Gemini) > A (filename).
    If both agree, note agreement.
    Returns:
        (voted_dict, provenance_record)
    """
    sources = {'filename': source_a, 'gemini': source_b}
    priority = ['gemini', 'filename']
    voted = {}
    provenance = {}
    for field in ('title', 'author', 'edition', 'year'):
        values = {}
        for name, src in sources.items():
            val = src.get(field)
            if val and str(val).strip().lower() != 'null':
                values[name] = val
        if not values:
            voted[field] = None
            provenance[field] = None
            continue
        def _norm(v):
            if field == 'year':
                return v.strip()
            return v.strip().lower()
        norm_groups = {}
        for name, val in values.items():
            nv = _norm(val)
            norm_groups.setdefault(nv, []).append((name, val))
        best_group = max(norm_groups.values(), key=len)
        if len(best_group) >= 2:
            # Both sources agree
            for p in priority:
                for name, val in best_group:
                    if name == p:
                        voted[field] = val
                        names = ','.join(n for n, _ in best_group)
                        provenance[field] = "agreed({})".format(names)
                        break
                if voted.get(field) is not None:
                    break
        else:
            # No agreement — highest priority wins
            for p in priority:
                if p in values:
                    voted[field] = values[p]
                    provenance[field] = p
                    break
    provenance_record = {
        'voted': voted,
        'provenance': provenance,
        'sources': {
            'filename': source_a,
            'gemini': source_b,
        }
    }
    return voted, provenance_record
 def pre_flight(content_path, meta_path, db, config):
    """Process a .txt file dropped into acquired/text/.
    Args:
        content_path: Path to the .txt file
        meta_path: Path to .meta.json sidecar (may be None)
        db: StatusDB instance
        config: RECON config dict
    Returns:
        dict with keys: hash, action, source_path, error
        Actions: 'extracted', 'duplicate', 'skip_empty', 'error'
    """
    result = {
        'hash': None,
        'action': 'error',
        'source_path': content_path,
        'error': None,
    }
    filename = os.path.basename(content_path)
    # ── Step 1: Hash ──────────────────────────────────────────────
    try:
        file_hash = content_hash(content_path)
        result['hash'] = file_hash
    except Exception as e:
        result['error'] = "Cannot hash text file: {}".format(e)
        return result
    # ── Step 2: Stale state cleanup ───────────────────────────────
    processing_root = config.get('pipeline', {}).get(
        'processing_root', '/opt/recon/data/processing'
    )
    proc_dir = os.path.join(processing_root, file_hash)
    concepts_dir = os.path.join(config['paths']['concepts'], file_hash)
    if os.path.exists(proc_dir):
        try:
            shutil.rmtree(proc_dir)
        except Exception as e:
            logger.error("Stale cleanup failed for %s: %s", proc_dir, e)
            raise
    if os.path.exists(concepts_dir):
        try:
            shutil.rmtree(concepts_dir)
        except Exception as e:
            logger.error("Stale cleanup failed for %s: %s", concepts_dir, e)
            raise
    # ── Step 3: Hash dedupe ───────────────────────────────────────
    conn = db._get_conn()
    existing = conn.execute(
        "SELECT hash FROM catalogue WHERE hash = ?", (file_hash,)
    ).fetchone()
    if existing:
        logger.info("Duplicate hash %s, removing pair", file_hash[:8])
        try:
            os.remove(content_path)
            if meta_path:
                os.remove(meta_path)
        except OSError as e:
            logger.warning("Failed to remove duplicate pair: %s", e)
        result['action'] = 'duplicate'
        return result
    # ── Step 4: Read text content ─────────────────────────────────
    try:
        with open(content_path, encoding='utf-8', errors='replace') as f:
            raw_text = f.read()
    except Exception as e:
        result['error'] = "Cannot read text file: {}".format(e)
        return result
    if len(raw_text.strip()) < 50:
        logger.info("Text file too short (%d chars), skipping: %s",
                     len(raw_text.strip()), filename)
        try:
            os.remove(content_path)
            if meta_path:
                os.remove(meta_path)
        except OSError:
            pass
        result['action'] = 'skip_empty'
        return result
    # ── Step 5: Read optional sidecar ─────────────────────────────
    sidecar = None
    if meta_path:
        try:
            with open(meta_path, encoding='utf-8') as f:
                sidecar = json.load(f)
        except Exception as e:
            logger.warning("Cannot read sidecar %s: %s", meta_path, e)
    # ── Step 6: Set up processing directory ───────────────────────
    try:
        os.makedirs(proc_dir, exist_ok=True)
    except Exception as e:
        result['error'] = "Cannot create processing dir: {}".format(e)
        return result
    # ── Step 7: Move files to processing ──────────────────────────
    source_path = os.path.join(proc_dir, 'source.txt')
    try:
        shutil.move(content_path, source_path)
        if meta_path:
            shutil.move(meta_path, os.path.join(proc_dir, 'meta.json'))
    except Exception as e:
        result['error'] = "Cannot move files to processing: {}".format(e)
        return result
    # ── Step 8: Split into pages ──────────────────────────────────
    pages = chunk_text(raw_text, WORDS_PER_PAGE)
    for i, page_text in enumerate(pages, start=1):
        page_path = os.path.join(proc_dir, "page_{:04d}.txt".format(i))
        with open(page_path, 'w', encoding='utf-8') as f:
            f.write(page_text)
    # ── Step 9: Source A — filename metadata ──────────────────────
    source_a = _extract_filename_metadata(filename)
    # ── Step 10: Source B — Gemini metadata ───────────────────────
    first_pages_text = "\n".join(pages[:3])
    source_b = _extract_gemini_metadata(first_pages_text, config)
    # ── Step 11: Vote ─────────────────────────────────────────────
    voted, provenance_record = _vote_metadata(source_a, source_b)
    # ── Step 12: Write meta.json ──────────────────────────────────
    meta = {
        'hash': file_hash,
        'filename': filename,
        'source_type': 'text',
        'page_count': len(pages),
        'text_length': len(raw_text),
        'metadata': voted,
        'metadata_provenance': provenance_record,
    }
    if sidecar:
        meta['sidecar'] = sidecar
    with open(os.path.join(proc_dir, 'meta.json'), 'w', encoding='utf-8') as f:
        json.dump(meta, f, indent=2)
    # ── Step 13: Register in catalogue ────────────────────────────
    display_title = voted.get('title') or clean_filename_to_title(filename)
    size_bytes = os.path.getsize(source_path)
    db.add_to_catalogue(
        file_hash, filename, source_path, size_bytes, 'text', 'Document'
    )
    # ── Step 14: Queue and update documents row ───────────────────
    db.queue_document(file_hash)
    conn = db._get_conn()
    conn.execute(
        "UPDATE documents SET "
        "text_dir = ?, page_count = ?, path = ?, "
        "book_title = ?, book_author = ?, metadata_provenance = ? "
        "WHERE hash = ?",
        (
            proc_dir,
            len(pages),
            source_path,
            voted.get('title'),
            voted.get('author'),
            json.dumps(provenance_record),
            file_hash,
        )
    )
    conn.commit()
    # ── Step 15: Status = extracted ───────────────────────────────
    db.update_status(file_hash, 'extracted', pages_extracted=len(pages))
    logger.info(
        "Text pre_flight complete: %s (%s) -> %d pages in %s",
        file_hash[:8], display_title, len(pages), proc_dir,
    )
    result['action'] = 'extracted'
    return result
--- a/lib/processors/transcript_processor.py
+++ b/lib/processors/transcript_processor.py
@ -0,0 +1,175 @@
 """
 RECON Transcript Processor
 Handles pre_flight for transcript content arriving in acquired/stream/.
 Reads a raw text file + meta.json sidecar, hashes, dedupes, splits into
 page_NNNN.txt files, and registers in the database.
 Phase 3: first processor implementation.
 Phase 4: added stale state cleanup at start of pre_flight.
 """
 import hashlib
 import json
 import logging
 import os
 import shutil
 from lib.web_scraper import chunk_text
 from lib.utils import content_hash
 logger = logging.getLogger("recon.processors.transcript")
 # Words per page for transcript chunking (matches existing pipeline)
 WORDS_PER_PAGE = 2000
 def pre_flight(content_path, meta_path, db, config):
    """Process a transcript pair from acquired/stream/.
    Args:
        content_path: Path to the raw transcript .txt file
        meta_path: Path to the .meta.json sidecar
        db: StatusDB instance
        config: RECON config dict
    Returns:
        dict with keys: hash, action, source_path, error
        Actions: 'extracted', 'duplicate', 'error'
    """
    result = {
        'hash': None,
        'action': 'error',
        'source_path': content_path,
        'error': None,
    }
    # Read and hash the content file
    try:
        file_hash = content_hash(content_path)
        result['hash'] = file_hash
    except Exception as e:
        result['error'] = f"Cannot hash content file: {e}"
        return result
    # Stale state cleanup — remove any pre-existing processing/concepts dirs
    processing_root = config.get('pipeline', {}).get(
        'processing_root', '/opt/recon/data/processing'
    )
    proc_dir = os.path.join(processing_root, file_hash)
    concepts_dir = os.path.join(config['paths']['concepts'], file_hash)
    if os.path.exists(proc_dir):
        try:
            shutil.rmtree(proc_dir)
        except Exception as e:
            logger.error("Stale cleanup failed for %s: %s", proc_dir, e)
            raise
    if os.path.exists(concepts_dir):
        try:
            shutil.rmtree(concepts_dir)
        except Exception as e:
            logger.error("Stale cleanup failed for %s: %s", concepts_dir, e)
            raise
    # Hash dedupe: if hash exists in catalogue, delete the pair and return
    conn = db._get_conn()
    existing = conn.execute(
        "SELECT hash FROM catalogue WHERE hash = ?", (file_hash,)
    ).fetchone()
    if existing:
        logger.info("Duplicate hash %s, removing pair", file_hash[:8])
        try:
            os.remove(content_path)
            if meta_path:
                os.remove(meta_path)
        except OSError as e:
            logger.warning("Failed to remove duplicate pair: %s", e)
        result['action'] = 'duplicate'
        return result
    # Read meta.json sidecar
    try:
        with open(meta_path, encoding='utf-8') as f:
            meta = json.load(f)
    except Exception as e:
        result['error'] = f"Cannot read meta.json: {e}"
        return result
    # Read raw transcript text
    try:
        with open(content_path, encoding='utf-8') as f:
            raw_text = f.read()
    except Exception as e:
        result['error'] = f"Cannot read content file: {e}"
        return result
    if not raw_text.strip():
        result['error'] = "Empty transcript"
        return result
    # Set up processing directory
    try:
        os.makedirs(proc_dir, exist_ok=True)
    except Exception as e:
        result['error'] = f"Cannot create processing dir: {e}"
        return result
    # Move the pair to processing/{hash}/
    try:
        shutil.move(content_path, os.path.join(proc_dir, 'transcript.txt'))
        shutil.move(meta_path, os.path.join(proc_dir, 'meta.json'))
    except Exception as e:
        result['error'] = f"Cannot move pair to processing: {e}"
        return result
    # Split raw text into page_NNNN.txt files
    pages = chunk_text(raw_text, WORDS_PER_PAGE)
    for i, page_text in enumerate(pages, start=1):
        page_path = os.path.join(proc_dir, f"page_{i:04d}.txt")
        with open(page_path, 'w', encoding='utf-8') as f:
            f.write(page_text)
    # Update meta.json with processing metadata
    meta['hash'] = file_hash
    meta['source_type'] = meta.get('source_type', 'transcript')
    meta['page_count'] = len(pages)
    meta['text_length'] = len(raw_text)
    meta_out_path = os.path.join(proc_dir, 'meta.json')
    with open(meta_out_path, 'w', encoding='utf-8') as f:
        json.dump(meta, f, indent=2)
    # Register in catalogue
    title = meta.get('title', os.path.basename(content_path))
    source_url = meta.get('source_url', meta.get('url', ''))
    source = 'stream.echo6.co'
    category = meta.get('category', 'Transcript')
    size_bytes = os.path.getsize(os.path.join(proc_dir, 'transcript.txt'))
    db.add_to_catalogue(file_hash, title, source_url, size_bytes, source, category)
    # Queue and advance to extracted
    db.queue_document(file_hash)
    # Set text_dir and page_count on the documents row.
    # Transcripts are derived text from PeerTube videos, not primary sources.
    # They don't get filed into library/Domain/Subdomain/ like PDFs -- instead,
    # they're marked organized in-place. Their watch URL remains in catalogue.path
    # and Qdrant download_url so users clicking search results go to PeerTube.
    # The filing worker's path LIKE filter naturally excludes transcripts since
    # their documents.path is the watch URL, not a filesystem path.
    conn = db._get_conn()
    conn.execute(
        "UPDATE documents SET text_dir = ?, page_count = ?, organized_at = CURRENT_TIMESTAMP WHERE hash = ?",
        (proc_dir, len(pages), file_hash)
    )
    conn.commit()
    # Update status to extracted with page count
    db.update_status(file_hash, 'extracted', pages_extracted=len(pages))
    logger.info(
        "Transcript pre_flight complete: %s (%s) -> %d pages in %s",
        file_hash[:8], title, len(pages), proc_dir,
    )
    result['action'] = 'extracted'
    return result
--- a/lib/utils.py
+++ b/lib/utils.py
@ -388,3 +388,19 @@ def generate_download_url(filepath, library_root='/mnt/library', base_url='https
    parts = rel.split(os.sep)
    encoded = '/'.join(quote(p) for p in parts)
    return f"{base_url}/{encoded}"
 def resolve_text_dir(file_hash, config, db=None):
    """Resolve the text directory for a document.
    If db is provided and documents.text_dir is set for this hash, use that.
    Otherwise fall back to the legacy location: config['paths']['text']/{hash}/
    """
    if db is not None:
        conn = db._get_conn()
        row = conn.execute(
            "SELECT text_dir FROM documents WHERE hash = ?", (file_hash,)
        ).fetchone()
        if row and row['text_dir']:
            return row['text_dir']
    return os.path.join(config['paths']['text'], file_hash)
--- a/recon.py
+++ b/recon.py
@ -3,7 +3,7 @@
 RECON CLI — Main entry point.
 Subcommands: scan, queue, extract, enrich, embed, run, search, upload,
-ingest-url, crawl, ingest-peertube, organize, status, catalogue, failures, validate, rebuild, serve, ingest.
+ingest-url, ingest-peertube, organize, status, catalogue, failures, validate, rebuild, serve, ingest.
 Usage: cd /opt/recon && source venv/bin/activate && python3 recon.py <command>
 """
@ -580,73 +580,6 @@ def cmd_ingest_url(args):
 def cmd_crawl(args):
    from lib.crawler import crawl_site
    print(f"Crawling {args.url}...")
    if args.include:
        print(f"  Include paths: {args.include}")
    if args.exclude:
        print(f"  Exclude paths: {args.exclude}")
    if args.dry_run:
        print(f"  DRY RUN — no content will be ingested")
    print()
    result = crawl_site(
        base_url=args.url,
        category=args.category,
        source=args.source,
        include=args.include,
        exclude=args.exclude,
        max_pages=args.max_pages,
        max_depth=args.max_depth,
        delay=args.delay,
        dry_run=args.dry_run,
        use_sitemap=not args.no_sitemap,
    )
    method = result.get('discovery_method', 'none')
    print(f"Discovery method: {method}")
    if args.dry_run:
        urls = result.get('urls', [])
        print(f"Found {len(urls)} URLs that would be ingested:\n")
        for i, url in enumerate(urls, 1):
            print(f"  {i:4d}. {url}")
        print(f"\nTotal: {len(urls)} pages")
        print(f"Re-run without --dry-run to ingest.")
        return 0
    summary = result.get('summary', {})
    print(f"\nResults:")
    print(f"  New:        {summary.get('succeeded', 0)}")
    print(f"  Duplicates: {summary.get('duplicates', 0)}")
    print(f"  Failed:     {summary.get('failed', 0)}")
    print(f"  Total:      {summary.get('total', 0)}")
    failed_results = [r for r in result.get('results', []) if r.get('status') == 'failed']
    if failed_results:
        print(f"\nFailed URLs:")
        for r in failed_results[:10]:
            print(f"  {r['url']}: {r.get('error', 'Unknown error')}")
        if len(failed_results) > 10:
            print(f"  ... and {len(failed_results) - 10} more")
    if args.enrich or args.process:
        print("\nRunning enrichment...")
        from lib.enricher import run_enrichment
        enriched = run_enrichment()
        print(f"  Enriched: {enriched}")
    if args.process:
        print("\nRunning embedding...")
        from lib.embedder import run_embedding
        embedded = run_embedding()
        print(f"  Embedded: {embedded}")
    return 0
 def cmd_validate(args):
    from scripts.validate import run_validation
    run_validation(deep=args.deep)
@ -668,30 +601,31 @@ def cmd_serve(args):
 def cmd_service(args):
    """Run RECON as a long-lived service. Called by systemd.
-    Bundles: Flask dashboard + pipeline stages + library scanner + PeerTube scanner + progress reporter.
+    Bundles: Flask dashboard + dispatcher + pipeline stages + filing worker + progress reporter.
    All threads are daemon threads; SIGTERM/SIGINT trigger graceful shutdown.
    """
    from lib.extractor import run_extraction
    from lib.enricher import run_enrichment
    from lib.embedder import run_embedding
    from lib.api import app, run_server as start_dashboard
-    from lib.peertube_scraper import ingest_all as pt_ingest_all
+    from lib.dispatcher import dispatch_loop
    from lib.filing import filing_worker_loop
    from lib.acquisition.peertube import acquisition_loop
    config = get_config()
    proc = config.get('processing', {})
    svc = config.get('service', {})
    extract_workers = proc.get('extract_workers', 4)
    enrich_workers = proc.get('enrich_workers', 16)
    embed_workers = proc.get('embed_workers', 4)
    poll_interval = svc.get('stage_poll_interval', 30)
-    scan_interval = svc.get('scan_interval', 3600)
+    dispatch_interval = svc.get('dispatch_interval', 30)
    filing_interval = svc.get('filing_interval', 30)
    progress_interval = svc.get('progress_interval', 60)
    web_host = config.get('web', {}).get('host', '0.0.0.0')
    web_port = config.get('web', {}).get('port', 8420)
    stop_event = threading.Event()
-    totals = {'extract': 0, 'enrich': 0, 'embed': 0, 'scan': 0}
+    totals = {'enrich': 0, 'embed': 0}
    def shutdown(signum, frame):
        sig_name = signal.Signals(signum).name
@ -716,36 +650,6 @@ def cmd_service(args):
            stop_event.wait(poll_interval)
        logger.info(f"[{name}] Stage stopped (total: {totals[name]})")
    def scanner_loop():
        """Periodically scan library and queue new documents."""
        logger.info(f"[scanner] Started (interval: {scan_interval}s)")
        # Run initial scan immediately
        try:
            new_cat = scan_library()
            new_queued = queue_all()
            synced = sync_qdrant_paths()
            if new_cat or new_queued or synced:
                logger.info(f"[scanner] Initial: {new_cat} catalogued, {new_queued} queued, {synced} paths synced")
        except Exception as e:
            logger.error(f"[scanner] Initial scan error: {e}", exc_info=True)
        while not stop_event.is_set():
            stop_event.wait(scan_interval)
            if stop_event.is_set():
                break
            try:
                new_cat = scan_library()
                new_queued = queue_all()
                synced = sync_qdrant_paths()
                totals['scan'] += new_queued
                if new_cat or new_queued or synced:
                    logger.info(f"[scanner] {new_cat} catalogued, {new_queued} queued, {synced} paths synced")
                else:
                    logger.debug("[scanner] No new documents")
            except Exception as e:
                logger.error(f"[scanner] Error: {e}", exc_info=True)
        logger.info("[scanner] Stopped")
    def progress_loop():
        """Log pipeline status periodically."""
        while not stop_event.is_set():
@ -769,217 +673,20 @@ def cmd_service(args):
            except Exception:
                pass
-    def peertube_scanner_loop():
+    db = StatusDB()
        """Periodically ingest new PeerTube video transcripts."""
        from lib.peertube_scraper import set_stop_check
        set_stop_check(stop_event.is_set)
        logger.info(f"[peertube] Scanner started (interval: {scan_interval}s)")
        try:
            result = pt_ingest_all(config=config)
            ingested = result.get('ingested', 0) if isinstance(result, dict) else 0
            if ingested:
                logger.info(f"[peertube] Initial scan: {ingested} transcripts ingested")
            else:
                logger.info("[peertube] Initial scan: no new transcripts")
        except Exception as e:
            logger.error(f"[peertube] Initial scan error: {e}", exc_info=True)
        while not stop_event.is_set():
            stop_event.wait(scan_interval)
            if stop_event.is_set():
                break
            try:
                result = pt_ingest_all(config=config)
                ingested = result.get('ingested', 0) if isinstance(result, dict) else 0
                if ingested:
                    logger.info(f"[peertube] {ingested} new transcripts ingested")
                else:
                    logger.debug("[peertube] No new transcripts")
            except Exception as e:
                logger.error(f"[peertube] Scan error: {e}", exc_info=True)
        logger.info("[peertube] Scanner stopped")
    def crawler_scheduler_loop():
        """Scheduled site crawler — crawls configured sites, one per cycle.
        - Reads sites from config.yaml crawler.sites list
        - Processes sites in tier order (lower tier = higher priority)
        - Tracks last-crawled timestamp per site in crawl_state.json
        - Skips sites crawled within recrawl_interval_days (default 7)
        - Waits inter_site_cooldown seconds between sites (default 30)
        - Uses per-site delay for rate limiting during crawl
        """
        import json as _json
        from lib.crawler import crawl_site
        crawler_cfg = config.get('crawler', {})
        sites = crawler_cfg.get('sites', [])
        if not sites:
            logger.info("[crawler] No sites configured, scheduler disabled")
            return
        recrawl_days = crawler_cfg.get('recrawl_interval_days', 7)
        cooldown = crawler_cfg.get('inter_site_cooldown', 30)
        state_file = os.path.join(config.get('paths', {}).get('data', '/opt/recon/data'), 'crawl_state.json')
        # Load persisted state
        def load_state():
            try:
                with open(state_file, 'r') as sf:
                    return _json.load(sf)
            except (FileNotFoundError, _json.JSONDecodeError):
                return {}
        def save_state(state):
            try:
                with open(state_file, 'w') as sf:
                    _json.dump(state, sf, indent=2)
            except Exception as e:
                logger.error(f"[crawler] Failed to save state: {e}")
        # Sort by tier (lower = higher priority)
        sorted_sites = sorted(sites, key=lambda s: s.get('tier', 99))
        logger.info(f"[crawler] Scheduler started — {len(sorted_sites)} sites configured, "
                    f"recrawl every {recrawl_days}d, {cooldown}s cooldown between sites")
        # Initial delay — let the main pipeline stabilize before crawling
        stop_event.wait(60)
        if stop_event.is_set():
            return
        while not stop_event.is_set():
            state = load_state()
            now = time.time()
            crawled_this_cycle = 0
            for site in sorted_sites:
                if stop_event.is_set():
                    break
                url = site.get('url', '')
                if not url:
                    continue
                # Check recrawl interval
                last_crawled = state.get(url, {}).get('last_crawled', 0)
                age_days = (now - last_crawled) / 86400
                if age_days < recrawl_days:
                    logger.debug(f"[crawler] Skipping {url} — crawled {age_days:.1f}d ago")
                    continue
                category = site.get('category', 'Web')
                max_depth = site.get('max_depth', crawler_cfg.get('max_depth', 3))
                max_pages = site.get('max_pages', crawler_cfg.get('max_pages', 500))
                delay = site.get('delay', crawler_cfg.get('rate_limit_delay', 1.0))
                tier = site.get('tier', 99)
                notes = site.get('notes', '')
                logger.info(f"[crawler] Starting: {url} (tier {tier}, category={category}, "
                            f"depth={max_depth}, pages={max_pages}, delay={delay}s)")
                try:
                    result = crawl_site(
                        base_url=url,
                        category=category,
                        max_pages=max_pages,
                        max_depth=max_depth,
                        delay=delay,
                        config=config,
                    )
                    summary = result.get('summary', {})
                    method = result.get('discovery_method', 'none')
                    logger.info(f"[crawler] Done: {url} via {method} — "
                                f"{summary.get('succeeded', 0)} new, "
                                f"{summary.get('duplicates', 0)} dupes, "
                                f"{summary.get('failed', 0)} failed")
                    # Update state
                    state[url] = {
                        'last_crawled': time.time(),
                        'method': method,
                        'succeeded': summary.get('succeeded', 0),
                        'duplicates': summary.get('duplicates', 0),
                        'failed': summary.get('failed', 0),
                        'tier': tier,
                        'category': category,
                    }
                    save_state(state)
                    crawled_this_cycle += 1
                except Exception as e:
                    logger.error(f"[crawler] Failed: {url} — {e}", exc_info=True)
                    state[url] = {
                        'last_crawled': time.time(),
                        'error': str(e),
                        'tier': tier,
                        'category': category,
                    }
                    save_state(state)
                # Inter-site cooldown
                if not stop_event.is_set():
                    logger.debug(f"[crawler] Cooling down {cooldown}s before next site...")
                    stop_event.wait(cooldown)
            if crawled_this_cycle:
                logger.info(f"[crawler] Cycle complete — {crawled_this_cycle} sites crawled")
            else:
                logger.debug("[crawler] Cycle complete — all sites within recrawl window")
            # Sleep until next cycle check (1 hour)
            if not stop_event.is_set():
                stop_event.wait(3600)
        logger.info("[crawler] Scheduler stopped")
    def organizer_loop():
        """Organize completed documents into Domain/Subdomain folders."""
        from lib.organizer import organize_document
        logger.info(f"[organizer] Started (interval: {poll_interval}s)")
        # Initial delay — let pipeline process some docs first
        stop_event.wait(60)
        while not stop_event.is_set():
            try:
                org_db = StatusDB()
                unorganized = org_db.get_unorganized(limit=100)
                if unorganized:
                    moved = 0
                    errors = 0
                    for doc in unorganized:
                        if stop_event.is_set():
                            break
                        result = organize_document(doc['hash'], org_db, config)
                        if result['action'] == 'moved':
                            moved += 1
                        elif result['action'] == 'error':
                            errors += 1
                    if moved or errors:
                        logger.info(f"[organizer] Organized {moved} documents ({errors} errors)")
                    # Sync paths to Qdrant after batch
                    if moved > 0:
                        synced = sync_qdrant_paths()
                        if synced:
                            logger.info(f"[organizer] Synced {synced} paths to Qdrant")
                else:
                    logger.debug("[organizer] No unorganized documents")
            except Exception as e:
                logger.error(f"[organizer] Error: {e}", exc_info=True)
            stop_event.wait(poll_interval)
        logger.info("[organizer] Stopped")
    threads = [
-        threading.Thread(target=stage_loop, daemon=True, name='extract',
+        threading.Thread(target=lambda: dispatch_loop(stop_event, db, config, interval=dispatch_interval),
-                         args=('extract', lambda: run_extraction(workers=extract_workers))),
+                         daemon=True, name='dispatcher'),
        threading.Thread(target=stage_loop, daemon=True, name='enrich',
                         args=('enrich', lambda: run_enrichment(workers=enrich_workers))),
        threading.Thread(target=stage_loop, daemon=True, name='embed',
                         args=('embed', lambda: run_embedding(workers=embed_workers))),
-        threading.Thread(target=scanner_loop, daemon=True, name='scanner'),
+        threading.Thread(target=lambda: filing_worker_loop(stop_event, db, config, interval=filing_interval),
-        threading.Thread(target=peertube_scanner_loop, daemon=True, name='peertube'),
+                         daemon=True, name='filing'),
-        threading.Thread(target=crawler_scheduler_loop, daemon=True, name='crawler'),
+        threading.Thread(target=lambda: acquisition_loop(stop_event, db, config,
-        threading.Thread(target=organizer_loop, daemon=True, name='organizer'),
+                                                       interval=config.get("peertube", {}).get("poll_interval", 1800)),
                         daemon=True, name="peertube-acq"),
        threading.Thread(target=progress_loop, daemon=True, name='progress'),
        threading.Thread(target=lambda: start_dashboard(stop_event),
                         daemon=True, name='dashboard'),
@ -987,10 +694,11 @@ def cmd_service(args):
    logger.info("=== RECON Service Starting ===")
    logger.info(f"  Dashboard: {web_host}:{web_port}")
-    logger.info(f"  Workers: extract={extract_workers}, enrich={enrich_workers}, embed={embed_workers}")
+    logger.info(f"  Workers: enrich={enrich_workers}, embed={embed_workers}")
-    logger.info(f"  Scanner: every {scan_interval}s | Progress: every {progress_interval}s")
+    logger.info(f"  Dispatcher: every {dispatch_interval}s | Filing: every {filing_interval}s")
-    crawler_sites = config.get('crawler', {}).get('sites', [])
+    pt_interval = config.get("peertube", {}).get("poll_interval", 1800)
-    logger.info(f"  Crawler: {len(crawler_sites)} sites configured")
+    logger.info(f"  PeerTube acquisition: every {pt_interval}s")
    logger.info(f"  Progress: every {progress_interval}s")
    for t in threads:
        t.start()
@ -1018,10 +726,10 @@ def cmd_service(args):
    logger.info("=== RECON Service Stopped ===")
    return 0
 def cmd_ingest_peertube(args):
-    from lib.peertube_scraper import ingest_channel, ingest_all, get_instance_stats
+    from lib.peertube_scraper import get_instance_stats
    from lib.acquisition.peertube import acquire_batch
    from lib.status import StatusDB
    if args.stats:
        stats = get_instance_stats()
@ -1034,36 +742,20 @@ def cmd_ingest_peertube(args):
                print(f"    {status}: {count}")
        return 0
-    since = args.since if hasattr(args, 'since') and args.since else None
+    db = StatusDB()
-    if args.channel:
+    print("Acquiring PeerTube transcripts to hopper...")
-        print(f"Ingesting PeerTube channel: {args.channel}")
+    result = acquire_batch(db)
        result = ingest_channel(args.channel, since=since)
    else:
        print("Ingesting all PeerTube videos with captions")
        result = ingest_all(since=since)
    summary = result.get('summary', {})
    print(f"\nResults:")
-    print(f"  Checked:      {summary.get('total_checked', 0)}")
+    print(f"  Acquired:     {result['acquired']}")
-    print(f"  Ingested:     {summary.get('ingested', 0)} ({summary.get('total_pages', 0)} pages)")
+    print(f"  Skipped:      {result['skipped']}")
-    print(f"  No captions:  {summary.get('skipped_no_captions', 0)}")
+    print(f"  Errors:       {result['errors']}")
    print(f"  Duplicates:   {summary.get('skipped_duplicate', 0)}")
    print(f"  Failed:       {summary.get('failed', 0)}")
-    # Optional: run enrichment
+    if result['acquired']:
-    if args.enrich or args.process:
+        print(f"\n{result['acquired']} transcript(s) staged in hopper.")
-        print("\nRunning enrichment on extracted content...")
+        print("The dispatcher will pick them up on its next cycle.")
-        from lib.enricher import run_enrichment
+        print("Run 'recon status' to monitor progress.")
        enriched = run_enrichment()
        print(f"  Enriched: {enriched}")
    # Optional: run embedding too
    if args.process:
        print("\nRunning embedding...")
        from lib.embedder import run_embedding
        embedded = run_embedding()
        print(f"  Embedded: {embedded}")
    return 0
@ -1417,21 +1109,6 @@ def main():
    p.set_defaults(func=cmd_ingest_url)
    # crawl
    p = sub.add_parser('crawl', help='Crawl a site and ingest discovered pages')
    p.add_argument('url', help='Base URL to crawl')
    p.add_argument('--category', default='Web', help='Category for ingested content')
    p.add_argument('--source', default=None, help='Source identifier (default: domain name)')
    p.add_argument('--include', nargs='+', help='Only include URL paths starting with these prefixes')
    p.add_argument('--exclude', nargs='+', help='Exclude URL paths starting with these prefixes')
    p.add_argument('--max-pages', type=int, default=500, help='Maximum pages to ingest')
    p.add_argument('--max-depth', type=int, default=3, help='Maximum link-follow depth')
    p.add_argument('--delay', type=float, default=1.0, help='Delay between page fetches (seconds)')
    p.add_argument('--dry-run', action='store_true', help='Discover URLs without ingesting')
    p.add_argument('--no-sitemap', action='store_true', help='Skip sitemap, use link-following only')
    p.add_argument('--enrich', action='store_true', help='Run enrichment after crawl')
    p.add_argument('--process', action='store_true', help='Full pipeline: crawl + enrich + embed')
    p.set_defaults(func=cmd_crawl)
    # validate
    p = sub.add_parser('validate', help='Validate pipeline consistency')
    p.add_argument('--deep', action='store_true', help='Deep validation (check all files)')
@ -1459,12 +1136,8 @@ def main():
    p.set_defaults(func=cmd_organize)
    # ingest-peertube
-    p = sub.add_parser('ingest-peertube', help='Ingest PeerTube video transcripts')
+    p = sub.add_parser('ingest-peertube', help='Acquire PeerTube transcripts to hopper')
    p.add_argument('--channel', help='Ingest specific channel (actor_name)')
    p.add_argument('--since', help='Only ingest videos published after this date (ISO format)')
    p.add_argument('--stats', action='store_true', help='Show PeerTube instance stats')
    p.add_argument('--enrich', action='store_true', help='Run enrichment after ingestion')
    p.add_argument('--process', action='store_true', help='Full pipeline: ingest + enrich + embed')
    p.set_defaults(func=cmd_ingest_peertube)
    # ingest
--- a/static/css/recon.css
+++ b/static/css/recon.css
@ -210,6 +210,7 @@ tr:hover { background: var(--bg-secondary); }
 }
 .badge-web { background: #1e3a5f; color: #60a5fa; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
 .badge-pdf { background: #2d5a2d; color: #4ade80; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
 .badge-transcript { background: #3b1f5e; color: #c084fc; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
 /* ── Trend indicators ── */
 .trend { font-size: 11px; margin-left: 6px; }
--- a/static/js/dashboard.js
+++ b/static/js/dashboard.js
@ -88,7 +88,7 @@
                var pipeCount = s.in_pipeline || 0;
                totalCat += catCount; totalComp += compCount; totalPipe += pipeCount;
                totalConcepts += s.concepts; totalVectors += s.vectors;
-                var badge = s.type === 'web' ? '<span class="badge-web">WEB</span>' : '<span class="badge-pdf">PDF</span>';
+                var badge = s.type === 'transcript' ? '<span class="badge-transcript">TRANSCRIPT</span>' : s.type === 'web' ? '<span class="badge-web">WEB</span>' : '<span class="badge-pdf">PDF</span>';
                var compPct = catCount > 0 ? (compCount / catCount * 100) : 0;
                var pipePct = catCount > 0 ? (pipeCount / catCount * 100) : 0;
                var compColor = compPct >= 100 ? '#00ff41' : compPct > 0 ? '#ffa500' : '#666';
@ -185,7 +185,7 @@
                rtb.innerHTML = '<tr><td colspan="4" class="text-dim">None yet</td></tr>';
            } else {
                rtb.innerHTML = data.recent_complete.map(function(r) {
-                    var badge = r.type === 'web' ? '<span class="badge-web">WEB</span>' : '<span class="badge-pdf">PDF</span>';
+                    var badge = r.type === 'transcript' ? '<span class="badge-transcript">TRANSCRIPT</span>' : r.type === 'web' ? '<span class="badge-web">WEB</span>' : '<span class="badge-pdf">PDF</span>';
                    return '<tr><td>' + r.title + '</td><td>' + badge + '</td><td>' +
                        r.concepts + '</td><td>' + r.vectors + '</td></tr>';
                }).join('');
--- a/templates/knowledge/upload.html
+++ b/templates/knowledge/upload.html
@ -1,12 +1,13 @@
 {% extends "base.html" %}
 {% block content %}
-<h3 class="section-title mb-16">Upload PDF</h3>
+<h3 class="section-title mb-16">Upload Document</h3>
 <div class="panel">
    <form id="upload-form" enctype="multipart/form-data">
        <div class="mb-16">
-            <label class="text-dim text-xs" style="text-transform:uppercase;display:block;margin-bottom:4px;">PDF File</label>
+            <label class="text-dim text-xs" style="text-transform:uppercase;display:block;margin-bottom:4px;">Document File</label>
-            <input type="file" name="file" accept=".pdf" id="upload-file"
+            <input type="file" name="file" accept=".pdf,.txt,.epub,.doc,.docx,.mobi" id="upload-file"
                style="background:#0a0a0a;border:1px solid #333;color:#c0c0c0;padding:8px;width:100%;font-family:inherit;">
            <span class="text-dim" style="font-size:11px;display:block;margin-top:4px;">Supported: PDF, TXT, EPUB, DOC, DOCX, MOBI</span>
        </div>
        <div class="mb-16">
            <label class="text-dim text-xs" style="text-transform:uppercase;display:block;margin-bottom:4px;">Category</label>
@ -67,7 +68,7 @@ document.getElementById('upload-form').addEventListener('submit', async function
            result.innerHTML = '<span style="color:#00ff41;">Queued for processing</span><br>' +
                '<span class="text-dim">Hash: ' + data.hash + '</span><br>' +
                '<span class="text-dim">File: ' + data.filename + '</span><br>' +
-                '<span class="text-dim">Category: ' + data.source + '/' + data.category + '</span>';
+                '<span class="text-dim">Type: ' + data.source_type + '</span>';
            fileInput.value = '';
        } else {
            status.style.color = '#ff4444';