From da50e5f0b8f0be3f1e42b0b7f14f729834c66c62 Mon Sep 17 00:00:00 2001
From: Matt <matt@echo6.co>
Date: Sat, 18 Apr 2026 18:26:43 +0000
Subject: [PATCH 01/11] Add scraper Phase 2: smart crawl mode detection +
 browser fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Pre-flight detection: wget + Playwright probe to auto-detect if site
  needs browser rendering (JS apps, parking page redirects)
- SingleFile CLI crawl backend for JS-rendered sites
- crawl_mode column in scrape_jobs (static/browser/redirect; NULL = auto-detect)
- API: optional crawl_mode param on submit, cleared on retry
- Config: rate_limit_delay 2.0→0.5, /api/ reject pattern, preflight
  + singlefile config sections
- Prerequisites: Node.js 22, single-file-cli, Playwright + Chromium

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 config.yaml           |  78 +++++
 lib/api.py            | 108 +++++++
 lib/scraper_runner.py | 695 ++++++++++++++++++++++++++++++++++++++++++
 lib/status.py         |  85 ++++++
 recon.py              |  11 +
 5 files changed, 977 insertions(+)
 create mode 100644 lib/scraper_runner.py

diff --git a/config.yaml b/config.yaml
index 4b147fd..c98a866 100644
--- a/config.yaml
+++ b/config.yaml
@@ -413,6 +413,84 @@ peertube:
   rate_limit_delay: 0.5                 # Delay between video ingestions (seconds)
   poll_interval: 1800                    # Seconds between PeerTube acquisition polls (30 min)
 
+scraper:
+  workspace: /opt/recon/data/scraper      # Working directory for wget mirrors + ZIM builds
+  output_dir: /mnt/kiwix                  # Finished .zim files land here (kiwix-serve library)
+  rate_limit_delay: 0.5                   # Seconds between wget requests (--wait)
+  wait_random: 1.0                        # Not read by the runner: wget --random-wait takes no value (it varies --wait by 0.5x-1.5x)
+  default_language: eng                   # ISO 639-3 language code for ZIM metadata
+  user_agent: "Mozilla/5.0 (compatible; RECON/1.0; +https://echo6.co)"
+  poll_interval: 300                      # Seconds between checking for pending scrape jobs
+  keep_workspace_on_failure: true         # Retain workspace for debugging when a job fails
+
+  # Default URL patterns rejected by wget --reject-regex.
+  # Covers common CMS junk across WordPress, Squarespace, Wix, Ghost, Drupal, etc.
+  # Per-job overrides: additional_reject_patterns (appended) or skip_default_patterns (bypass).
+  default_reject_patterns:
+    # WordPress
+    - '\?share='
+    - '\?replytocom='
+    - '\?like_comment='
+    - '/feed/'
+    - '/wp-json/'
+    - '/wp-login'
+    - '/wp-admin'
+    - '/wp-cron'
+    - '\?attachment_id='
+    - '/xmlrpc'
+    - '/trackback'
+    - '/comment-page-'
+    - '\?doing_wp_cron'
+    # Squarespace
+    - '\?format=json'
+    - '\?format=rss'
+    - '/api/'
+    # Wix
+    - '/_api/'
+    - '/_partials/'
+    # Ghost
+    - '/ghost/'
+    - '/p/'
+    # Drupal
+    - '\?q=comment'
+    - '\?q=node'
+    - '/user/login'
+    - '/user/register'
+    # General CMS / site chrome
+    - '/login'
+    - '/signup'
+    - '/register'
+    - '/cart'
+    - '/checkout'
+    - '/search\?'
+    - '/tag/'
+    - '/author/'
+    - '\?print='
+    - '\?pdf='
+    - '\?format=amp'
+    - '\?preview='
+    - '/rss'
+    - '/atom'
+    - '/cdn-cgi/'
+
+  # Pre-flight mode detection
+  preflight:
+    enabled: true
+    timeout: 30                    # Seconds for single-page Playwright fetch
+    min_static_size: 5120          # 5KB - wget HTML below this = suspect JS site
+    min_browser_size: 20480        # 20KB - browser HTML above this confirms JS
+    spa_markers:
+      - 'div#root'
+      - 'div#app'
+      - 'div#__next'
+
+  # SingleFile CLI settings (browser crawl mode)
+  singlefile:
+    executable: single-file
+    chromium_path: ""              # Auto-detected from Playwright if empty
+    crawl_max_depth: 10
+    crawl_delay: 2                 # Seconds between page fetches
+
 # Stream B: New Library Pipeline
 new_pipeline:
   # Disabled 2026-04-14 for refactor — see refactored-recon repo for context
diff --git a/lib/api.py b/lib/api.py
index a739ec0..cbb3377 100644
--- a/lib/api.py
+++ b/lib/api.py
@@ -2256,6 +2256,114 @@ def _build_kiwix_sources():
     }
 
 
+
+
+# ── Scraper API ──
+
+@app.route('/api/scraper/submit', methods=['POST'])
+def api_scraper_submit():
+    """Submit a new web scrape job; responds 201 with job_id, or 400 when the input is rejected."""
+    data = request.get_json(silent=True) or {}
+    url = (data.get('url') or '').strip()
+
+    if not url:
+        return jsonify({'error': 'url is required'}), 400
+    if not url.startswith(('http://', 'https://')):
+        return jsonify({'error': 'URL must start with http:// or https://'}), 400
+
+    config = get_config()
+    scraper_cfg = config.get('scraper', {})
+    language = data.get('language') or scraper_cfg.get('default_language', 'eng')
+    title = (data.get('title') or '').strip() or None        # `or ''`: an explicit JSON null must not reach .strip()
+    category = (data.get('category') or '').strip() or None  # same guard as title
+
+    # Optional per-job reject pattern overrides
+    additional_reject_patterns = data.get('additional_reject_patterns')
+    skip_default_patterns = bool(data.get('skip_default_patterns', False))
+
+    # Optional crawl mode override (static, browser, redirect, or null for auto-detect)
+    crawl_mode = data.get('crawl_mode')
+    if crawl_mode and crawl_mode not in ('static', 'browser', 'redirect'):
+        return jsonify({'error': "crawl_mode must be 'static', 'browser', 'redirect', or null"}), 400
+
+    # Serialize additional patterns as JSON if provided
+    import json as _json
+    additional_json = _json.dumps(additional_reject_patterns) if additional_reject_patterns else None
+
+    db = StatusDB()
+    conn = db._get_conn()
+    conn.execute(
+        "INSERT INTO scrape_jobs (url, title, language, category, additional_reject_patterns, skip_default_patterns, crawl_mode) VALUES (?, ?, ?, ?, ?, ?, ?)",
+        (url, title, language, category, additional_json, int(skip_default_patterns), crawl_mode)
+    )
+    conn.commit()
+    job_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
+
+    logger.info(f"Scraper job {job_id} submitted: {url}")
+    return jsonify({'ok': True, 'job_id': job_id}), 201
+
+
+@app.route('/api/scraper/jobs')
+def api_scraper_jobs():
+    """Return scrape jobs as JSON under 'jobs', narrowed by the optional ?status= query arg."""
+    wanted_status = request.args.get('status')
+    # A fresh StatusDB per request, as in the other scraper endpoints.
+    matching = StatusDB().get_scrape_jobs(status=wanted_status)
+    return jsonify({'jobs': matching})
+
+
+@app.route('/api/scraper/cancel/<int:job_id>', methods=['POST'])
+def api_scraper_cancel(job_id):
+    """Cancel a scrape job: mark it cancelled in the DB, then SIGTERM its recorded subprocess, if any."""
+    import os as _os
+    import signal as _signal
+
+    db = StatusDB()
+    job = db.get_scrape_job(job_id)
+    if not job:
+        return jsonify({'error': 'Job not found'}), 404
+
+    if job['status'] in ('complete', 'cancelled'):
+        return jsonify({'error': f"Job already {job['status']}"}), 400
+
+    # Set cancelled in DB — the runner loop checks this between phases
+    db.update_scrape_job(job_id, status='cancelled')
+
+    # If there's an active subprocess, send SIGTERM (pid is the value read before the update above)
+    pid = job.get('subprocess_pid')
+    if pid:
+        try:
+            _os.kill(pid, _signal.SIGTERM)
+        except (ProcessLookupError, PermissionError):
+            pass  # Process already gone, or not ours to signal
+
+    logger.info(f"Scraper job {job_id} cancelled")
+    return jsonify({'ok': True})
+
+
+@app.route('/api/scraper/retry/<int:job_id>', methods=['POST'])
+def api_scraper_retry(job_id):
+    """Put a failed or cancelled scrape job back into the pending queue."""
+    db = StatusDB()
+    job = db.get_scrape_job(job_id)
+    if not job:
+        return jsonify({'error': 'Job not found'}), 404
+
+    retryable = ('failed', 'cancelled')
+    if job['status'] not in retryable:
+        return jsonify({'error': f"Job status is '{job['status']}', can only retry failed or cancelled jobs"}), 400
+
+    # Everything recorded by the previous attempt is nulled out, crawl_mode
+    # included, so the job carries no stored mode into its next run.
+    cleared = dict.fromkeys(
+        ('error_message', 'subprocess_pid', 'crawl_mode', 'started_at', 'completed_at')
+    )
+    db.update_scrape_job(job_id, status='pending', **cleared)
+
+    logger.info(f"Scraper job {job_id} reset to pending for retry")
+    return jsonify({'ok': True})
+
+
 # ── Metrics API ──
 
 @app.route('/api/metrics/history')
diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py
new file mode 100644
index 0000000..1599f2e
--- /dev/null
+++ b/lib/scraper_runner.py
@@ -0,0 +1,695 @@
+"""
+RECON Scraper Runner
+
+Daemon loop that processes scrape jobs: crawl → zimwriterfs → kiwix-manage.
+Supports two crawl backends:
+  - wget (static sites) — default
+  - SingleFile CLI (JS-rendered sites) — browser mode
+
+Pre-flight detection automatically chooses the right backend unless
+crawl_mode is pre-set on the job.
+
+Public entry point: scraper_loop(stop_event, config).
+
+Config section: scraper (workspace, output_dir, rate_limit_delay, preflight, singlefile)
+DB table: scrape_jobs (status flow: pending → scraping → packaging → complete)
+"""
+import glob as _glob
+import json as _json
+import os
+import re
+import shutil
+import signal
+import subprocess
+import tempfile
+import time
+from datetime import datetime, timezone
+from urllib.parse import urlparse
+
+from .utils import setup_logging
+from .status import StatusDB
+
+logger = setup_logging('recon.scraper_runner')
+
+
+def scraper_loop(stop_event, config):
+    """Daemon loop: run pending scrape jobs one at a time until stop_event is set; waits poll_interval when idle."""
+    scraper_cfg = config.get('scraper', {})
+    poll_interval = scraper_cfg.get('poll_interval', 300)
+
+    logger.info("Scraper runner started")
+
+    while not stop_event.is_set():
+        db = StatusDB()
+        job = db.get_pending_scrape_job()
+        if job:
+            try:
+                _process_job(job, config, stop_event)
+            except Exception as e:
+                logger.error(f"Scraper job {job['id']} unexpected error: {e}", exc_info=True)
+                try:
+                    db.update_scrape_job(job['id'],
+                                         status='failed',
+                                         error_message=str(e)[:1000],
+                                         subprocess_pid=None,
+                                         completed_at=_now())
+                except Exception as db_err:  # keep the loop alive, but never hide an unrecorded failure
+                    logger.error(f"Scraper job {job['id']}: could not record failure: {db_err}")
+        else:
+            stop_event.wait(poll_interval)
+
+    logger.info("Scraper runner stopped")
+
+
+def _now():
+    return datetime.now(tz=timezone.utc).isoformat()  # timezone-aware UTC timestamp, ISO 8601
+
+
+def _sanitize_domain(url):
+    """Return the URL's hostname minus a leading 'www.', or 'unknown' when it has none."""
+    host = urlparse(url).hostname or 'unknown'
+    # Only a leading 'www.' label is dropped; any other subdomain is kept.
+    if host[:4] == 'www.':
+        return host[4:]
+    return host
+
+
+def _sanitize_filename(s):
+    """Swap every character outside [a-zA-Z0-9._-] for an underscore."""
+    return re.compile(r'[^a-zA-Z0-9._-]').sub('_', s)
+
+
+def _check_cancelled(db, job_id):
+    """Re-read the job row and report whether its status is now 'cancelled' (falsy if the row is missing)."""
+    row = db.get_scrape_job(job_id)
+    return row and row['status'] == 'cancelled'
+
+
+def _kill_process(proc, timeout=5):
+    """Stop a subprocess: terminate() first, kill() if it outlives the grace period."""
+    if proc.poll() is None:
+        proc.terminate()
+        try:
+            proc.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
+            # Still alive after `timeout` seconds — escalate.
+            proc.kill()
+            proc.wait(timeout=2)
+
+
+def _count_html_files(directory):
+    """Return how many .html/.htm files (case-insensitive) exist under directory."""
+    html_suffixes = ('.html', '.htm')
+    total = 0
+    for _root, _dirs, names in os.walk(directory):
+        # Booleans sum as 0/1, so this tallies the matching names of one directory.
+        total += sum(name.lower().endswith(html_suffixes) for name in names)
+    return total
+
+
+def _find_welcome_page(content_dir, domain):
+    """Locate the mirror's host directory and welcome page; returns (page, directory), page is None if no subdirectory."""
+    # Prefer the directory named after the crawled host; os.listdir order is arbitrary, so sort the fallback.
+    subdirs = sorted(e for e in os.listdir(content_dir)
+                     if os.path.isdir(os.path.join(content_dir, e)))
+    if not subdirs:
+        return None, content_dir
+    preferred = [d for d in subdirs if d in (domain, f'www.{domain}')]
+    domain_dir = os.path.join(content_dir, (preferred or subdirs)[0])
+
+    for candidate in ['index.html', 'index.htm']:
+        path = os.path.join(domain_dir, candidate)
+        if os.path.isfile(path):
+            return candidate, domain_dir
+
+    # No index at the top level: fall back to the first HTML file in sorted walk order.
+    for root, dirs, files in os.walk(domain_dir):
+        dirs.sort()
+        for f in sorted(files):
+            if f.lower().endswith(('.html', '.htm')):
+                rel = os.path.relpath(os.path.join(root, f), domain_dir)
+                return rel, domain_dir
+
+    return 'index.html', domain_dir
+
+
+def _create_placeholder_illustration(path):
+    """Write a solid-colour 48x48 RGB PNG to path, for zimwriterfs --illustration."""
+    from PIL import Image
+    placeholder = Image.new('RGB', (48, 48), color=(40, 192, 232))
+    placeholder.save(path, 'PNG')
+
+
+# ── Crawl mode detection ──────────────────────────────────────────
+
+
+def _get_chromium_path(config):
+    """Return the configured Chromium binary if that file exists, else the highest-revision Playwright build, else None."""
+    configured = config.get('scraper', {}).get('singlefile', {}).get('chromium_path', '')
+    if configured and os.path.isfile(configured):
+        return configured
+    # Playwright caches (user, then root); highest numeric revision wins — a string sort ranks chromium-999 above chromium-1000
+    search_paths = [
+        os.path.expanduser('~/.cache/ms-playwright/chromium-*/chrome-linux*/chrome'),
+        '/root/.cache/ms-playwright/chromium-*/chrome-linux*/chrome',
+    ]
+    for pattern in search_paths:
+        matches = sorted(_glob.glob(pattern), key=lambda p: ([int(n) for n in re.findall(r'chromium-(\d+)', p)], p))
+        if matches:
+            return matches[-1]
+    return None
+
+
+def _detect_crawl_mode(url, config):
+    """
+    Pre-flight detection: determine whether a URL needs a browser to crawl.
+    Falls back to ('static', url) when preflight is disabled or Playwright fails.
+    Returns (mode, resolved_url) where mode is 'static', 'browser', or 'redirect'.
+    'redirect' means the URL redirected to a different domain (parking page etc.);
+    resolved_url will be the final browser URL in that case.
+    """
+    preflight_cfg = config.get('scraper', {}).get('preflight', {})
+    if not preflight_cfg.get('enabled', True):
+        return 'static', url
+
+    timeout = preflight_cfg.get('timeout', 30)
+    min_static = preflight_cfg.get('min_static_size', 5120)
+    min_browser = preflight_cfg.get('min_browser_size', 20480)
+    spa_markers = preflight_cfg.get('spa_markers', ['div#root', 'div#app', 'div#__next'])
+
+    input_domain = urlparse(url).hostname or ''
+    if input_domain.startswith('www.'):
+        input_domain = input_domain[4:]
+
+    # Step 1: wget single-page fetch
+    wget_html, wget_size = '', 0
+    tmp_path = None  # bound before the try so cleanup is safe even if tempfile creation fails
+    try:
+        with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as tmp:
+            tmp_path = tmp.name
+        subprocess.run(
+            ['wget', '-q', '-O', tmp_path, '--timeout=30', '--tries=1', url],
+            capture_output=True, text=True, timeout=timeout + 5
+        )
+        wget_size = os.path.getsize(tmp_path)
+        with open(tmp_path, 'r', errors='replace') as f:
+            wget_html = f.read()
+    except Exception as e:
+        logger.debug(f"Preflight wget failed for {url}: {e}")
+    finally:
+        if tmp_path:
+            try:
+                os.unlink(tmp_path)
+            except OSError:
+                pass
+
+    # Step 2: Playwright headless fetch
+    browser_html = ''
+    browser_size = 0
+    browser_url = url
+    try:
+        from playwright.sync_api import sync_playwright
+        with sync_playwright() as p:
+            browser = p.chromium.launch(
+                headless=True,
+                args=['--no-sandbox', '--disable-dev-shm-usage']
+            )
+            page = browser.new_page()
+            page.goto(url, wait_until='networkidle', timeout=timeout * 1000)
+            browser_url = page.url
+            browser_html = page.content()
+            browser_size = len(browser_html.encode('utf-8'))
+            browser.close()
+    except Exception as e:
+        logger.debug(f"Preflight Playwright failed for {url}: {e}")
+        # If Playwright fails entirely, fall back to static
+        return 'static', url
+
+    # Step 3: Decision logic
+    browser_domain = urlparse(browser_url).hostname or ''
+    if browser_domain.startswith('www.'):
+        browser_domain = browser_domain[4:]
+
+    # Check for cross-domain redirect (parking page detection)
+    if browser_domain and input_domain and browser_domain != input_domain:
+        logger.info(f"Preflight: {url} redirected to different domain {browser_domain}, mode=redirect")
+        return 'redirect', browser_url
+
+    # Check size disparity: small wget + large browser = JS-rendered
+    if wget_size < min_static and browser_size > min_browser:
+        logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=browser")
+        return 'browser', url
+
+    # Check for SPA shell markers in wget HTML
+    if wget_html:
+        try:
+            from bs4 import BeautifulSoup
+            soup = BeautifulSoup(wget_html, 'html.parser')
+            for marker in spa_markers:
+                # marker is like 'div#root' — split tag and id
+                parts = marker.split('#', 1)
+                tag = parts[0] if parts[0] else 'div'
+                elem_id = parts[1] if len(parts) > 1 else None
+                elem = soup.find(tag, id=elem_id) if elem_id else soup.find(tag)
+                if elem:
+                    text_content = elem.get_text(strip=True)
+                    if len(text_content) < 100:
+                        logger.info(f"Preflight: {url} has SPA marker {marker} with {len(text_content)} chars text, mode=browser")
+                        return 'browser', url
+        except Exception as e:
+            logger.debug(f"Preflight SPA marker check failed: {e}")
+
+    logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=static")
+    return 'static', url
+
+
+# ── Crawl backends ────────────────────────────────────────────────
+
+
+def _crawl_wget(job, url, site_dir, config, stop_event, db):
+    """
+    wget mirror crawl backend; wget's output goes to wget.log next to site_dir.
+    Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel.
+    """
+    job_id = job['id']
+    scraper_cfg = config.get('scraper', {})
+    rate_limit_delay = scraper_cfg.get('rate_limit_delay', 0.5)
+    user_agent = scraper_cfg.get('user_agent', 'Mozilla/5.0 (compatible; RECON/1.0)')
+    workspace = os.path.dirname(site_dir)
+
+    # Build reject-regex from config defaults + per-job overrides
+    reject_patterns = []
+    if not job.get('skip_default_patterns'):
+        reject_patterns.extend(scraper_cfg.get('default_reject_patterns', []))
+    additional_raw = job.get('additional_reject_patterns')
+    if additional_raw:
+        try:
+            additional = _json.loads(additional_raw) if isinstance(additional_raw, str) else additional_raw
+            if isinstance(additional, list):
+                reject_patterns.extend(additional)
+        except (ValueError, TypeError):
+            logger.warning(f"Job {job_id}: ignoring unparseable additional_reject_patterns")
+
+    wget_cmd = [
+        'wget', '--mirror', '--convert-links', '--adjust-extension',
+        '--page-requisites', '--no-parent',
+        '--restrict-file-names=windows',
+        f'--wait={rate_limit_delay}', '--random-wait',
+        f'--user-agent={user_agent}',
+        f'--directory-prefix={site_dir}',
+        '--timeout=30', '--tries=3',
+    ]
+    if reject_patterns:
+        combined_regex = '|'.join(f'({p})' for p in reject_patterns)
+        wget_cmd.extend([f'--reject-regex={combined_regex}'])
+        logger.info(f"Job {job_id}: reject-regex has {len(reject_patterns)} patterns")
+    wget_cmd.append(url)
+
+    logger.info(f"Job {job_id}: wget mirror starting")
+    wget_log = os.path.join(workspace, 'wget.log')
+    proc = None
+    try:
+        with open(wget_log, 'w') as log_fh:
+            proc = subprocess.Popen(
+                wget_cmd,
+                stdout=log_fh, stderr=subprocess.STDOUT,
+            )
+        db.update_scrape_job(job_id, subprocess_pid=proc.pid)
+
+        while proc.poll() is None:
+            if stop_event.is_set() or _check_cancelled(db, job_id):
+                _kill_process(proc)
+                return 0, 'cancelled'
+            try:
+                proc.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                pass
+
+        db.update_scrape_job(job_id, subprocess_pid=None)
+
+        if stop_event.is_set() or _check_cancelled(db, job_id):
+            return 0, 'cancelled'
+
+        # Exit codes 4 (network failure), 6 (auth failure), 8 (server error response) may still leave useful content
+        if proc.returncode not in (0, 4, 6, 8):
+            output = ''
+            try:
+                with open(wget_log, 'r') as f:
+                    f.seek(max(0, os.path.getsize(wget_log) - 500))
+                    output = f.read()
+            except Exception:
+                pass
+            return 0, f"wget failed with code {proc.returncode}: {output[-500:]}"
+
+    except Exception as e:
+        if proc is not None:
+            _kill_process(proc)  # don't leave wget running once the job is reported as failed
+        return 0, f"wget error: {e}"
+
+    page_count = _count_html_files(site_dir)
+    logger.info(f"Job {job_id}: wget complete, {page_count} HTML pages found")
+    if page_count == 0:
+        return 0, 'wget produced no HTML files'
+
+    return page_count, None
+
+
+def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
+    """
+    SingleFile CLI crawl backend for JS-rendered sites; output goes to singlefile.log next to site_dir.
+    Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel.
+    """
+    job_id = job['id']
+    scraper_cfg = config.get('scraper', {})
+    sf_cfg = scraper_cfg.get('singlefile', {})
+    workspace = os.path.dirname(site_dir)
+
+    executable = sf_cfg.get('executable', 'single-file')
+    chromium_path = _get_chromium_path(config)
+    crawl_max_depth = sf_cfg.get('crawl_max_depth', 10)
+    crawl_delay = sf_cfg.get('crawl_delay', 2)
+
+    if not chromium_path:
+        return 0, 'Chromium not found — cannot use browser crawl mode'
+
+    # SingleFile outputs into site_dir/<domain>/ to match wget's structure
+    domain = _sanitize_domain(url)
+    output_dir = os.path.join(site_dir, domain)
+    os.makedirs(output_dir, exist_ok=True)
+
+    sf_cmd = [
+        executable,
+        '--crawl-links=true',
+        '--crawl-inner-links-only=true',
+        f'--crawl-max-depth={crawl_max_depth}',
+        f'--crawl-delay={crawl_delay * 1000}',  # milliseconds
+        f'--browser-executable-path={chromium_path}',
+        '--browser-headless=true',
+        '--browser-args=["--no-sandbox","--disable-dev-shm-usage"]',
+        f'--output-directory={output_dir}',
+        url,
+    ]
+
+    logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth}, delay={crawl_delay}s)")
+    sf_log = os.path.join(workspace, 'singlefile.log')
+    proc = None
+    try:
+        with open(sf_log, 'w') as log_fh:
+            proc = subprocess.Popen(
+                sf_cmd,
+                stdout=log_fh, stderr=subprocess.STDOUT,
+            )
+        db.update_scrape_job(job_id, subprocess_pid=proc.pid)
+
+        while proc.poll() is None:
+            if stop_event.is_set() or _check_cancelled(db, job_id):
+                _kill_process(proc)
+                return 0, 'cancelled'
+            try:
+                proc.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                pass
+
+        db.update_scrape_job(job_id, subprocess_pid=None)
+
+        if stop_event.is_set() or _check_cancelled(db, job_id):
+            return 0, 'cancelled'
+
+        if proc.returncode != 0:
+            output = ''
+            try:
+                with open(sf_log, 'r') as f:
+                    f.seek(max(0, os.path.getsize(sf_log) - 500))
+                    output = f.read()
+            except Exception:
+                pass
+            # SingleFile may still produce some files even with non-zero exit
+            page_count = _count_html_files(site_dir)
+            if page_count == 0:
+                return 0, f"SingleFile failed with code {proc.returncode}: {output[-500:]}"
+            logger.warning(f"Job {job_id}: SingleFile exited {proc.returncode} but produced {page_count} pages, continuing")
+
+    except Exception as e:
+        if proc is not None:
+            _kill_process(proc)  # don't leave the crawler running once the job is reported as failed
+        return 0, f"SingleFile error: {e}"
+
+    # If no index.html exists, rename the first HTML file to index.html
+    index_path = os.path.join(output_dir, 'index.html')
+    if not os.path.isfile(index_path):
+        for f in sorted(os.listdir(output_dir)):
+            if f.lower().endswith(('.html', '.htm')):
+                os.rename(os.path.join(output_dir, f), index_path)
+                logger.info(f"Job {job_id}: renamed {f} → index.html")
+                break
+
+    page_count = _count_html_files(site_dir)
+    logger.info(f"Job {job_id}: SingleFile complete, {page_count} HTML pages found")
+    if page_count == 0:
+        return 0, 'SingleFile produced no HTML files'
+
+    return page_count, None
+
+
+# ── Main job pipeline ─────────────────────────────────────────────
+
+
+def _process_job(job, config, stop_event):
+    """Execute the full scrape pipeline for a single job: detect mode, crawl, build the ZIM, register it with kiwix."""
+    db = StatusDB()
+    job_id = job['id']
+    url = job['url']
+    title = job.get('title') or _sanitize_domain(url)
+    language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng')
+    category = job.get('category') or ''
+
+    scraper_cfg = config.get('scraper', {})
+    workspace_root = scraper_cfg.get('workspace', '/opt/recon/data/scraper')
+    output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix')
+    keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True)
+
+    workspace = os.path.join(workspace_root, str(job_id))
+    site_dir = os.path.join(workspace, 'site')
+    os.makedirs(site_dir, exist_ok=True)
+
+    domain = _sanitize_domain(url)
+    date_tag = datetime.now().strftime('%Y-%m')
+    zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}.zim"
+    zim_path = os.path.join(output_dir, zim_filename)
+
+    logger.info(f"Job {job_id}: starting scrape of {url}")
+    db.update_scrape_job(job_id,
+                         status='scraping',
+                         workspace_path=workspace,
+                         started_at=_now())
+
+    # ── Phase 0: Pre-flight mode detection ─────────────────────────
+    if stop_event.is_set() or _check_cancelled(db, job_id):
+        _handle_cancel(db, job_id, workspace, keep_workspace)
+        return
+
+    pre_set = job.get('crawl_mode')
+    if pre_set:
+        crawl_mode, resolved_url = pre_set, url
+        logger.info(f"Job {job_id}: using pre-set crawl_mode={crawl_mode}")
+    else:
+        crawl_mode, resolved_url = _detect_crawl_mode(url, config)
+        logger.info(f"Job {job_id}: detected crawl_mode={crawl_mode}")
+
+    db.update_scrape_job(job_id, crawl_mode=crawl_mode)
+
+    # If redirect detected, update domain/filename to match resolved URL
+    if crawl_mode == 'redirect' and resolved_url != url:
+        logger.info(f"Job {job_id}: URL resolved from {url} → {resolved_url}")
+        domain = _sanitize_domain(resolved_url)
+        zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}.zim"
+        zim_path = os.path.join(output_dir, zim_filename)
+
+    # ── Phase A: Crawl (dispatch to backend) ────────────────────────
+    if stop_event.is_set() or _check_cancelled(db, job_id):
+        _handle_cancel(db, job_id, workspace, keep_workspace)
+        return
+
+    if crawl_mode == 'browser':
+        page_count, error = _crawl_singlefile(job, resolved_url, site_dir, config, stop_event, db)
+    else:  # 'static' or 'redirect'
+        page_count, error = _crawl_wget(job, resolved_url, site_dir, config, stop_event, db)
+
+    if error == 'cancelled':
+        _handle_cancel(db, job_id, workspace, keep_workspace)
+        return
+    elif error:
+        db.update_scrape_job(job_id,
+                             status='failed',
+                             error_message=error,
+                             subprocess_pid=None,
+                             completed_at=_now())
+        if not keep_workspace:
+            shutil.rmtree(workspace, ignore_errors=True)
+        return
+
+    db.update_scrape_job(job_id, page_count=page_count)
+
+    # ── Phase B: Prepare zimwriterfs inputs ────────────────────────
+    if stop_event.is_set() or _check_cancelled(db, job_id):
+        _handle_cancel(db, job_id, workspace, keep_workspace)
+        return
+
+    welcome_page, content_dir = _find_welcome_page(site_dir, domain)
+    if welcome_page is None:
+        welcome_page = 'index.html'
+
+    illustration_path = os.path.join(workspace, 'illustration.png')
+    _create_placeholder_illustration(illustration_path)
+    illust_dest = os.path.join(content_dir, 'illustration.png')
+    shutil.copy2(illustration_path, illust_dest)
+
+    description = f"{category} — mirror of {domain}" if category else f"Mirror of {domain}"
+
+    logger.info(f"Job {job_id}: packaging ZIM (welcome={welcome_page}, content_dir={content_dir})")
+    db.update_scrape_job(job_id, status='packaging')
+
+    # ── Phase C: zimwriterfs ───────────────────────────────────────
+    if stop_event.is_set() or _check_cancelled(db, job_id):
+        _handle_cancel(db, job_id, workspace, keep_workspace)
+        return
+
+    zim_name = _sanitize_filename(domain)
+    long_description = f"Offline mirror of {resolved_url} created by RECON web scraper"
+
+    zim_cmd = [
+        'zimwriterfs',
+        f'--welcome={welcome_page}',
+        f'--illustration=illustration.png',
+        f'--language={language}',
+        f'--title={title}',
+        f'--description={description[:80]}',
+        f'--longDescription={long_description[:4096]}',
+        f'--name={zim_name}',
+        f'--creator={domain}',
+        '--publisher=RECON',
+        content_dir,
+        zim_path,
+    ]
+
+    zim_log = os.path.join(workspace, 'zimwriterfs.log')
+    proc = None
+    try:
+        with open(zim_log, 'w') as log_fh:
+            proc = subprocess.Popen(
+                zim_cmd,
+                stdout=log_fh, stderr=subprocess.STDOUT,
+            )
+        db.update_scrape_job(job_id, subprocess_pid=proc.pid)
+
+        while proc.poll() is None:
+            if stop_event.is_set() or _check_cancelled(db, job_id):
+                _kill_process(proc)
+                _handle_cancel(db, job_id, workspace, keep_workspace)
+                return
+            try:
+                proc.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                pass
+
+        db.update_scrape_job(job_id, subprocess_pid=None)
+
+        if stop_event.is_set() or _check_cancelled(db, job_id):
+            _handle_cancel(db, job_id, workspace, keep_workspace)
+            return
+
+        if proc.returncode != 0:
+            output = ''
+            try:
+                with open(zim_log, 'r') as f:
+                    f.seek(max(0, os.path.getsize(zim_log) - 500))
+                    output = f.read()
+            except Exception:
+                pass
+            raise RuntimeError(f"zimwriterfs failed with code {proc.returncode}: {output[-500:]}")
+
+    except RuntimeError:
+        raise
+    except Exception as e:
+        if proc is not None:
+            _kill_process(proc)  # don't leave zimwriterfs running once the job is reported as failed
+        db.update_scrape_job(job_id,
+                             status='failed',
+                             error_message=f"zimwriterfs error: {e}",
+                             subprocess_pid=None,
+                             completed_at=_now())
+        if not keep_workspace:
+            shutil.rmtree(workspace, ignore_errors=True)
+        return
+
+    if not os.path.isfile(zim_path):
+        db.update_scrape_job(job_id,
+                             status='failed',
+                             error_message='zimwriterfs produced no output file',
+                             completed_at=_now())
+        return
+
+    logger.info(f"Job {job_id}: ZIM created at {zim_path}")
+
+    # ── Phase D: kiwix-manage + registration ───────────────────────
+    if stop_event.is_set() or _check_cancelled(db, job_id):
+        _handle_cancel(db, job_id, workspace, keep_workspace)
+        return
+
+    kiwix_manage = shutil.which('kiwix-manage') or '/opt/recon/bin/kiwix-manage'
+    library_xml = '/mnt/kiwix/library.xml'
+
+    try:
+        subprocess.run(
+            [kiwix_manage, library_xml, 'add', zim_path],
+            capture_output=True, text=True, timeout=30, check=True  # non-zero exit -> warning below, not a false "registered"
+        )
+        logger.info(f"Job {job_id}: registered with kiwix-serve library")
+    except Exception as e:
+        logger.warning(f"Job {job_id}: kiwix-manage add failed: {e}")
+
+    try:
+        result = subprocess.run(['pidof', 'kiwix-serve'], capture_output=True, text=True, timeout=5)
+        if result.returncode == 0 and result.stdout.strip():
+            pid = int(result.stdout.strip().split()[0])
+            os.kill(pid, signal.SIGHUP)
+            logger.info(f"Job {job_id}: sent SIGHUP to kiwix-serve (pid {pid})")
+    except Exception as e:
+        logger.warning(f"Job {job_id}: failed to signal kiwix-serve: {e}")
+
+    zim_source_id = None
+    try:
+        from .zim_monitor import scan_zims
+        scan_zims()
+        row = db._get_conn().execute(
+            "SELECT id FROM zim_sources WHERE zim_filename = ?", (zim_filename,)
+        ).fetchone()
+        if row:
+            zim_source_id = row['id']
+            logger.info(f"Job {job_id}: linked to zim_source_id={zim_source_id}")
+    except Exception as e:
+        logger.warning(f"Job {job_id}: scan_zims failed: {e}")
+
+    try:
+        shutil.rmtree(workspace, ignore_errors=True)
+    except Exception:
+        pass
+
+    db.update_scrape_job(job_id,
+                         status='complete',
+                         zim_filename=zim_filename,
+                         zim_source_id=zim_source_id,
+                         completed_at=_now())
+
+    logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages, mode={crawl_mode})")
+
+
+def _handle_cancel(db, job_id, workspace, keep_workspace):
+    """Mark the job cancelled in the DB and, unless keep_workspace is set, delete its workspace."""
+    logger.info(f"Job {job_id}: cancelled")
+    cancelled_fields = {'status': 'cancelled', 'subprocess_pid': None, 'completed_at': _now()}
+    db.update_scrape_job(job_id, **cancelled_fields)
+    if keep_workspace:
+        return
+    # Best-effort removal; errors while deleting are ignored.
+    shutil.rmtree(workspace, ignore_errors=True)
diff --git a/lib/status.py b/lib/status.py
index 20cc77b..974cabd 100644
--- a/lib/status.py
+++ b/lib/status.py
@@ -105,6 +105,25 @@ class StatusDB:
         except Exception:
             pass  # column already exists
 
+        # Migration: add subprocess_pid column to scrape_jobs if missing
+        try:
+            conn.execute("ALTER TABLE scrape_jobs ADD COLUMN subprocess_pid INTEGER")
+        except Exception:
+            pass  # column already exists
+
+        # Migration: add reject pattern columns to scrape_jobs if missing
+        for col, coltype in [('additional_reject_patterns', 'TEXT'), ('skip_default_patterns', 'INTEGER DEFAULT 0')]:
+            try:
+                conn.execute(f"ALTER TABLE scrape_jobs ADD COLUMN {col} {coltype}")
+            except Exception:
+                pass  # column already exists
+
+        # Migration: add crawl_mode column to scrape_jobs if missing
+        try:
+            conn.execute("ALTER TABLE scrape_jobs ADD COLUMN crawl_mode TEXT")
+        except Exception:
+            pass  # column already exists
+
         # Stream B: file_operations + duplicate_review tables
         conn.executescript("""
             CREATE TABLE IF NOT EXISTS file_operations (
@@ -142,6 +161,28 @@ class StatusDB:
                 resolved_at TEXT
             );
             CREATE INDEX IF NOT EXISTS idx_dupreview_status ON duplicate_review(status);
+
+            CREATE TABLE IF NOT EXISTS scrape_jobs (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                url TEXT NOT NULL,
+                title TEXT,
+                language TEXT DEFAULT 'eng',
+                category TEXT,
+                status TEXT DEFAULT 'pending',
+                page_count INTEGER DEFAULT 0,
+                error_message TEXT,
+                zim_filename TEXT,
+                zim_source_id INTEGER,
+                workspace_path TEXT,
+                subprocess_pid INTEGER,
+                additional_reject_patterns TEXT,
+                skip_default_patterns INTEGER DEFAULT 0,
+                crawl_mode TEXT,
+                created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+                started_at TEXT,
+                completed_at TEXT
+            );
+            CREATE INDEX IF NOT EXISTS idx_scrape_status ON scrape_jobs(status);
         """)
         conn.commit()
 
@@ -406,6 +447,50 @@ class StatusDB:
         )
         conn.commit()
 
+
+    # ── Scraper Job Helpers ─────────────────────────────────────
+
+    def get_pending_scrape_job(self):
+        """Return the oldest pending scrape job (lowest id) as a dict, or None if there is none."""
+        conn = self._get_conn()
+        row = conn.execute(
+            "SELECT * FROM scrape_jobs WHERE status = 'pending' ORDER BY id ASC LIMIT 1"
+        ).fetchone()
+        return dict(row) if row else None
+
+    def update_scrape_job(self, job_id, **kwargs):
+        """Set columns on one scrape job; each kwarg is a column name and its new value."""
+        if not kwargs:
+            return
+        # Values are bound as parameters, but column names are spliced into the
+        # SQL text, so refuse anything that is not a plain identifier.
+        bad = [k for k in kwargs if not k.isidentifier()]
+        if bad:
+            raise ValueError(f"Invalid scrape_jobs column name(s): {bad}")
+        assignments = ', '.join(f"{k} = ?" for k in kwargs)
+        conn = self._get_conn()
+        conn.execute(f"UPDATE scrape_jobs SET {assignments} WHERE id = ?", [*kwargs.values(), job_id])
+        conn.commit()
+
+    def get_scrape_jobs(self, status=None):
+        """Return scrape jobs as dicts, highest id first; a truthy status filters to that status."""
+        conn = self._get_conn()
+        if status:
+            rows = conn.execute(
+                "SELECT * FROM scrape_jobs WHERE status = ? ORDER BY id DESC", (status,)
+            ).fetchall()
+        else:
+            rows = conn.execute(
+                "SELECT * FROM scrape_jobs ORDER BY id DESC"
+            ).fetchall()
+        return [dict(r) for r in rows]
+
+    def get_scrape_job(self, job_id):
+        """Return the scrape job with this id as a dict, or None if no such row exists."""
+        conn = self._get_conn()
+        row = conn.execute("SELECT * FROM scrape_jobs WHERE id = ?", (job_id,)).fetchone()
+        return dict(row) if row else None
+
     # ── Stream B: File Operations ───────────────────────────────────
 
     def log_file_operation(self, doc_hash, operation, source_path, target_path,
diff --git a/recon.py b/recon.py
index 47dda7d..9635a59 100755
--- a/recon.py
+++ b/recon.py
@@ -692,12 +692,23 @@ def cmd_service(args):
                          daemon=True, name='dashboard'),
     ]
 
+    # Scraper daemon: polls for pending scrape jobs, runs wget+zimwriterfs pipeline
+    scraper_cfg = config.get('scraper', {})
+    if scraper_cfg.get('workspace'):
+        from lib.scraper_runner import scraper_loop
+        threads.append(
+            threading.Thread(target=lambda: scraper_loop(stop_event, config),
+                             daemon=True, name='scraper')
+        )
+
     logger.info("=== RECON Service Starting ===")
     logger.info(f"  Dashboard: {web_host}:{web_port}")
     logger.info(f"  Workers: enrich={enrich_workers}, embed={embed_workers}")
     logger.info(f"  Dispatcher: every {dispatch_interval}s | Filing: every {filing_interval}s")
     pt_interval = config.get("peertube", {}).get("poll_interval", 1800)
     logger.info(f"  PeerTube acquisition: every {pt_interval}s")
+    if scraper_cfg.get('workspace'):
+        logger.info(f"  Scraper: every {scraper_cfg.get('poll_interval', 300)}s")
     logger.info(f"  Progress: every {progress_interval}s")
 
     for t in threads:

From 125602fa1369ea0b1ec7a98406e2321473e428d1 Mon Sep 17 00:00:00 2001
From: Matt <matt@echo6.co>
Date: Sat, 18 Apr 2026 19:28:03 +0000
Subject: [PATCH 02/11] Fix SingleFile CLI: remove invalid --crawl-delay flag

SingleFile CLI has no --crawl-delay option. The invalid flag caused the
process to print help and exit with no output. Added --crawl-no-parent
and --crawl-replace-URLs instead. Removed unused crawl_delay config key.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 config.yaml           | 1 -
 lib/scraper_runner.py | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/config.yaml b/config.yaml
index c98a866..bdabf69 100644
--- a/config.yaml
+++ b/config.yaml
@@ -489,7 +489,6 @@ scraper:
     executable: single-file
     chromium_path: ""              # Auto-detected from Playwright if empty
     crawl_max_depth: 10
-    crawl_delay: 2                 # Seconds between page fetches
 
 # Stream B: New Library Pipeline
 new_pipeline:
diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py
index 1599f2e..a3ff820 100644
--- a/lib/scraper_runner.py
+++ b/lib/scraper_runner.py
@@ -368,7 +368,6 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
     executable = sf_cfg.get('executable', 'single-file')
     chromium_path = _get_chromium_path(config)
     crawl_max_depth = sf_cfg.get('crawl_max_depth', 10)
-    crawl_delay = sf_cfg.get('crawl_delay', 2)
 
     if not chromium_path:
         return 0, 'Chromium not found — cannot use browser crawl mode'
@@ -382,8 +381,9 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
         executable,
         '--crawl-links=true',
         '--crawl-inner-links-only=true',
+        '--crawl-no-parent=true',
+        '--crawl-replace-URLs=true',
         f'--crawl-max-depth={crawl_max_depth}',
-        f'--crawl-delay={crawl_delay * 1000}',  # milliseconds
         f'--browser-executable-path={chromium_path}',
         '--browser-headless=true',
         '--browser-args=["--no-sandbox","--disable-dev-shm-usage"]',
@@ -391,7 +391,7 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
         url,
     ]
 
-    logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth}, delay={crawl_delay}s)")
+    logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth})")
     sf_log = os.path.join(workspace, 'singlefile.log')
     try:
         with open(sf_log, 'w') as log_fh:

From 45b954fccc3b60ecf182e6ea55f12692916b894d Mon Sep 17 00:00:00 2001
From: Matt <matt@echo6.co>
Date: Sat, 18 Apr 2026 20:17:53 +0000
Subject: [PATCH 03/11] Fix ZIM filename collisions by appending job ID

Format: {domain}_{lang}_{YYYY-MM}_{job_id}.zim
Prevents zimwriterfs failures when the same domain is scraped
multiple times in the same month.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 lib/scraper_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py
index a3ff820..280b874 100644
--- a/lib/scraper_runner.py
+++ b/lib/scraper_runner.py
@@ -474,7 +474,7 @@ def _process_job(job, config, stop_event):
 
     domain = _sanitize_domain(url)
     date_tag = datetime.now().strftime('%Y-%m')
-    zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}.zim"
+    zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
     zim_path = os.path.join(output_dir, zim_filename)
 
     logger.info(f"Job {job_id}: starting scrape of {url}")
@@ -502,7 +502,7 @@ def _process_job(job, config, stop_event):
     if crawl_mode == 'redirect' and resolved_url != url:
         logger.info(f"Job {job_id}: URL resolved from {url} → {resolved_url}")
         domain = _sanitize_domain(resolved_url)
-        zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}.zim"
+        zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
         zim_path = os.path.join(output_dir, zim_filename)
 
     # ── Phase A: Crawl (dispatch to backend) ────────────────────────

From 1ce9a3731f566aaba230b91905477dfb37a2b636 Mon Sep 17 00:00:00 2001
From: Matt <matt@echo6.co>
Date: Sat, 18 Apr 2026 20:47:17 +0000
Subject: [PATCH 04/11] Add scraper dashboard UI under Kiwix tab

New /kiwix/scraper page with submit form (URL, title, language,
crawl mode), stats cards, and auto-refreshing jobs table with
cancel/retry actions. Kiwix section now has Library/Scraper subnav.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 lib/api.py                   |  11 ++-
 static/css/recon.css         |   1 +
 static/js/scraper.js         | 155 +++++++++++++++++++++++++++++++++++
 templates/kiwix/scraper.html |  91 ++++++++++++++++++++
 4 files changed, 257 insertions(+), 1 deletion(-)
 create mode 100644 static/js/scraper.js
 create mode 100644 templates/kiwix/scraper.html

diff --git a/lib/api.py b/lib/api.py
index cbb3377..aa13a39 100644
--- a/lib/api.py
+++ b/lib/api.py
@@ -60,7 +60,10 @@ PEERTUBE_SUBNAV = [
 ]
 
 
-KIWIX_SUBNAV = []  # Single-page, no subnav needed
+KIWIX_SUBNAV = [
+    {'href': '/kiwix', 'label': 'Library'},
+    {'href': '/kiwix/scraper', 'label': 'Scraper'},
+]
 SETTINGS_SUBNAV = [
     {'href': '/settings/keys', 'label': 'API Keys'},
     {'href': '/settings/cookies', 'label': 'YouTube Cookies'},
@@ -1956,6 +1959,12 @@ def kiwix_dashboard():
                            domain='kiwix', subnav=KIWIX_SUBNAV, active_page='/kiwix')
 
 
+@app.route('/kiwix/scraper')
+def kiwix_scraper():
+    return render_template('kiwix/scraper.html',
+                           domain='kiwix', subnav=KIWIX_SUBNAV, active_page='/kiwix/scraper')
+
+
 @app.route('/api/kiwix/sources')
 def api_kiwix_sources():
     """Serve pre-cached Kiwix sources data (never blocks)."""
diff --git a/static/css/recon.css b/static/css/recon.css
index 31d6306..a272876 100644
--- a/static/css/recon.css
+++ b/static/css/recon.css
@@ -331,3 +331,4 @@ tr:hover { background: var(--bg-secondary); }
 .badge-detected { background: #333; color: #888; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
 .badge-processing { background: #4a3a1a; color: #f59e0b; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
 .badge-extracting { background: #1a3a5a; color: #0ea5e9; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
+.badge-failed { background: #4a1a1a; color: #ff4444; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
diff --git a/static/js/scraper.js b/static/js/scraper.js
new file mode 100644
index 0000000..6aa23d7
--- /dev/null
+++ b/static/js/scraper.js
@@ -0,0 +1,155 @@
+/* RECON Scraper Dashboard JS */
+(function() {
+    'use strict';
+
+    function loadJobs() {
+        return RECON.fetchJSON('/api/scraper/jobs').then(function(data) {
+            var jobs = data.jobs || [];
+
+            // Stats
+            var total = jobs.length;
+            var active = 0, complete = 0, failed = 0;
+            jobs.forEach(function(j) {
+                if (j.status === 'complete') complete++;
+                else if (j.status === 'failed') failed++;
+                else if (j.status === 'running' || j.status === 'pending') active++;
+            });
+            RECON.set('sc-total', RECON.fmt(total));
+            RECON.set('sc-active', RECON.fmt(active));
+            RECON.set('sc-complete', RECON.fmt(complete));
+            RECON.set('sc-failed', RECON.fmt(failed));
+
+            // Table
+            var html = '';
+            jobs.forEach(function(j) {
+                var badge = statusBadge(j.status);
+                var mode = j.crawl_mode ?
+                    '<span class="text-small">' + escHtml(j.crawl_mode) + '</span>' : '<span class="text-muted">\u2014</span>';
+                var pages = j.page_count ? RECON.fmt(j.page_count) : '\u2014';
+                var zim = j.zim_filename ?
+                    '<span class="text-small">' + escHtml(j.zim_filename) + '</span>' : '\u2014';
+                var actions = '';
+
+                if (j.status === 'running' || j.status === 'pending') {
+                    actions = '<button class="btn btn-danger" onclick="SCRAPER.cancel(' + j.id + ')">Cancel</button>';
+                } else if (j.status === 'failed' || j.status === 'cancelled') {
+                    actions = '<button class="btn" onclick="SCRAPER.retry(' + j.id + ')">Retry</button>';
+                }
+
+                // Truncate URL for display
+                var displayUrl = j.url.length > 40 ? j.url.substring(0, 40) + '\u2026' : j.url;
+
+                html += '<tr>' +
+                    '<td>' + j.id + '</td>' +
+                    '<td><a href="' + escHtml(j.url) + '" target="_blank" title="' + escHtml(j.url) + '">' + escHtml(displayUrl) + '</a></td>' +
+                    '<td>' + escHtml(j.title || '\u2014') + '</td>' +
+                    '<td>' + mode + '</td>' +
+                    '<td>' + pages + '</td>' +
+                    '<td>' + badge + errorTooltip(j) + '</td>' +
+                    '<td>' + zim + '</td>' +
+                    '<td>' + actions + '</td>' +
+                    '</tr>';
+            });
+            if (!html) html = '<tr><td colspan="8" class="text-muted">No scrape jobs</td></tr>';
+            RECON.setHTML('sc-table-body', html);
+        }).catch(function(err) {
+            console.error('Scraper dashboard error:', err);
+        });
+    }
+
+    function statusBadge(status) {
+        var map = {
+            'pending': '<span class="badge-detected">PENDING</span>',
+            'running': '<span class="badge-processing">RUNNING</span>',
+            'complete': '<span class="badge-complete">COMPLETE</span>',
+            'failed': '<span class="badge-failed">FAILED</span>',
+            'cancelled': '<span class="badge-detected">CANCELLED</span>'
+        };
+        return map[status] || '<span class="badge-detected">' + escHtml((status || 'UNKNOWN').toUpperCase()) + '</span>';
+    }
+
+    function errorTooltip(job) {
+        if (!job.error_message) return '';
+        var short = job.error_message.length > 80 ?
+            job.error_message.substring(0, 80) + '\u2026' : job.error_message;
+        return '<div class="text-small text-muted" style="max-width:200px;word-break:break-all;" title="' +
+            escHtml(job.error_message) + '">' + escHtml(short) + '</div>';
+    }
+
+    function escHtml(str) {
+        if (!str) return '';
+        return str.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;')
+                  .replace(/"/g, '&quot;').replace(/'/g, '&#39;');
+    }
+
+    function submit(e) {
+        e.preventDefault();
+        var url = document.getElementById('sf-url').value.trim();
+        if (!url) return false;
+
+        var body = { url: url };
+        var title = document.getElementById('sf-title').value.trim();
+        var lang = document.getElementById('sf-lang').value;
+        var category = document.getElementById('sf-category').value.trim();
+        var mode = document.getElementById('sf-mode').value;
+
+        if (title) body.title = title;
+        if (lang) body.language = lang;
+        if (category) body.category = category;
+        if (mode) body.crawl_mode = mode;
+
+        var btn = document.getElementById('sf-submit-btn');
+        var feedback = document.getElementById('sf-feedback');
+        btn.disabled = true;
+        btn.textContent = 'Submitting...';
+
+        RECON.postJSON('/api/scraper/submit', body).then(function(data) {
+            btn.disabled = false;
+            btn.textContent = 'Submit';
+            if (data.ok) {
+                feedback.style.display = 'block';
+                feedback.style.color = '#00ff41';
+                feedback.textContent = 'Job #' + data.job_id + ' submitted successfully';
+                document.getElementById('sf-url').value = '';
+                document.getElementById('sf-title').value = '';
+                document.getElementById('sf-category').value = '';
+                setTimeout(function() { feedback.style.display = 'none'; }, 4000);
+                loadJobs();
+            } else {
+                feedback.style.display = 'block';
+                feedback.style.color = '#ff4444';
+                feedback.textContent = 'Error: ' + (data.error || 'Unknown error');
+            }
+        }).catch(function(err) {
+            btn.disabled = false;
+            btn.textContent = 'Submit';
+            feedback.style.display = 'block';
+            feedback.style.color = '#ff4444';
+            feedback.textContent = 'Network error: ' + err.message;
+        });
+
+        return false;
+    }
+
+    function cancel(jobId) {
+        if (!confirm('Cancel job #' + jobId + '?')) return;
+        RECON.postJSON('/api/scraper/cancel/' + jobId).then(function(data) {
+            if (data.ok) loadJobs();
+            else alert('Error: ' + (data.error || 'Unknown'));
+        });
+    }
+
+    function retry(jobId) {
+        RECON.postJSON('/api/scraper/retry/' + jobId).then(function(data) {
+            if (data.ok) loadJobs();
+            else alert('Error: ' + (data.error || 'Unknown'));
+        });
+    }
+
+    // Expose for inline onclick
+    window.SCRAPER = { submit: submit, cancel: cancel, retry: retry };
+
+    document.addEventListener('DOMContentLoaded', function() {
+        RECON.startRefresh(loadJobs, 10000);
+    });
+})();
diff --git a/templates/kiwix/scraper.html b/templates/kiwix/scraper.html
new file mode 100644
index 0000000..53d3e23
--- /dev/null
+++ b/templates/kiwix/scraper.html
@@ -0,0 +1,91 @@
+{% extends "base.html" %}
+{% block content %}
+<div id="scraper-page">
+    <!-- Submit Form -->
+    <div class="panel">
+        <h3 class="section-title" style="margin-bottom:12px;">Submit Scrape Job</h3>
+        <form id="scraper-form" onsubmit="return SCRAPER.submit(event)">
+            <div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;margin-bottom:12px;">
+                <div>
+                    <label for="sf-url" class="text-small text-muted" style="display:block;margin-bottom:4px;">URL *</label>
+                    <input type="url" id="sf-url" placeholder="https://example.com/" required
+                           style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
+                </div>
+                <div>
+                    <label for="sf-title" class="text-small text-muted" style="display:block;margin-bottom:4px;">Title</label>
+                    <input type="text" id="sf-title" placeholder="Optional display title"
+                           style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
+                </div>
+            </div>
+            <div style="display:grid;grid-template-columns:1fr 1fr 1fr auto;gap:12px;align-items:end;">
+                <div>
+                    <label for="sf-lang" class="text-small text-muted" style="display:block;margin-bottom:4px;">Language</label>
+                    <select id="sf-lang"
+                            style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
+                        <option value="eng" selected>English</option>
+                        <option value="spa">Spanish</option>
+                        <option value="fra">French</option>
+                        <option value="deu">German</option>
+                        <option value="por">Portuguese</option>
+                        <option value="rus">Russian</option>
+                        <option value="jpn">Japanese</option>
+                        <option value="zho">Chinese</option>
+                        <option value="mul">Multilingual</option>
+                    </select>
+                </div>
+                <div>
+                    <label for="sf-category" class="text-small text-muted" style="display:block;margin-bottom:4px;">Category</label>
+                    <input type="text" id="sf-category" placeholder="Optional"
+                           style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
+                </div>
+                <div>
+                    <label for="sf-mode" class="text-small text-muted" style="display:block;margin-bottom:4px;">Crawl Mode</label>
+                    <select id="sf-mode"
+                            style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
+                        <option value="" selected>Auto-detect</option>
+                        <option value="static">Static (wget)</option>
+                        <option value="browser">Browser (SingleFile)</option>
+                    </select>
+                </div>
+                <div>
+                    <button type="submit" class="btn" id="sf-submit-btn">Submit</button>
+                </div>
+            </div>
+            <div id="sf-feedback" style="margin-top:8px;font-size:12px;display:none;"></div>
+        </form>
+    </div>
+
+    <!-- Stats row -->
+    <div class="stat-grid" style="grid-template-columns:repeat(4, 1fr);">
+        <div class="stat-card"><div class="label">Total Jobs</div><div class="value" id="sc-total">&mdash;</div></div>
+        <div class="stat-card"><div class="label">Active</div><div class="value" id="sc-active">&mdash;</div></div>
+        <div class="stat-card"><div class="label">Complete</div><div class="value" id="sc-complete">&mdash;</div></div>
+        <div class="stat-card"><div class="label">Failed</div><div class="value" id="sc-failed">&mdash;</div></div>
+    </div>
+
+    <!-- Jobs Table -->
+    <div class="panel">
+        <h3 class="section-title" style="margin-bottom:12px;">Scrape Jobs</h3>
+        <table class="data-table" id="sc-table">
+            <thead>
+                <tr>
+                    <th>ID</th>
+                    <th>URL</th>
+                    <th>Title</th>
+                    <th>Mode</th>
+                    <th>Pages</th>
+                    <th>Status</th>
+                    <th>ZIM</th>
+                    <th></th>
+                </tr>
+            </thead>
+            <tbody id="sc-table-body">
+                <tr><td colspan="8" class="text-muted">Loading...</td></tr>
+            </tbody>
+        </table>
+    </div>
+</div>
+{% endblock %}
+{% block scripts %}
+<script src="/static/js/scraper.js"></script>
+{% endblock %}

From 45c3bb8d56d431e32fc8ecc5b57aa5cc65c488c2 Mon Sep 17 00:00:00 2001
From: Matt <matt@echo6.co>
Date: Sat, 18 Apr 2026 21:03:39 +0000
Subject: [PATCH 05/11] Add scraper job queue management (delete, clear failed)

New API endpoints: DELETE single job, clear all failed/cancelled.
Dashboard now shows Delete buttons on completed/failed jobs,
Retry+Delete on failed jobs, and a Clear Failed bulk action.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 lib/api.py                   | 30 ++++++++++++++++++++++++++++++
 static/js/scraper.js         | 29 ++++++++++++++++++++++++++---
 templates/kiwix/scraper.html |  5 ++++-
 3 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/lib/api.py b/lib/api.py
index aa13a39..ce0381f 100644
--- a/lib/api.py
+++ b/lib/api.py
@@ -2373,6 +2373,36 @@ def api_scraper_retry(job_id):
     return jsonify({'ok': True})
 
 
+@app.route('/api/scraper/delete/<int:job_id>', methods=['POST'])
+def api_scraper_delete(job_id):
+    """Delete a scrape job (only if not currently running)."""
+    db = StatusDB()
+    job = db.get_scrape_job(job_id)
+    if not job:
+        return jsonify({'error': 'Job not found'}), 404
+
+    if job['status'] == 'running':
+        return jsonify({'error': 'Cannot delete a running job — cancel it first'}), 400
+
+    conn = db._get_conn()
+    conn.execute("DELETE FROM scrape_jobs WHERE id = ?", (job_id,))
+    conn.commit()
+    logger.info(f"Scraper job {job_id} deleted")
+    return jsonify({'ok': True})
+
+
+@app.route('/api/scraper/clear-failed', methods=['POST'])
+def api_scraper_clear_failed():
+    """Delete every failed or cancelled scrape job and report how many rows were removed."""
+    db = StatusDB()
+    conn = db._get_conn()
+    result = conn.execute("DELETE FROM scrape_jobs WHERE status IN ('failed', 'cancelled')")
+    conn.commit()
+    count = result.rowcount
+    logger.info(f"Cleared {count} failed/cancelled scraper jobs")
+    return jsonify({'ok': True, 'deleted': count})
+
+
 # ── Metrics API ──
 
 @app.route('/api/metrics/history')
diff --git a/static/js/scraper.js b/static/js/scraper.js
index 6aa23d7..49ce178 100644
--- a/static/js/scraper.js
+++ b/static/js/scraper.js
@@ -11,7 +11,7 @@
             var active = 0, complete = 0, failed = 0;
             jobs.forEach(function(j) {
                 if (j.status === 'complete') complete++;
-                else if (j.status === 'failed') failed++;
+                else if (j.status === 'failed' || j.status === 'cancelled') failed++;
                 else if (j.status === 'running' || j.status === 'pending') active++;
             });
             RECON.set('sc-total', RECON.fmt(total));
@@ -19,6 +19,10 @@
             RECON.set('sc-complete', RECON.fmt(complete));
             RECON.set('sc-failed', RECON.fmt(failed));
 
+            // Show/hide Clear Failed button
+            var clearBtn = document.getElementById('sc-clear-btn');
+            if (clearBtn) clearBtn.style.display = failed > 0 ? '' : 'none';
+
             // Table
             var html = '';
             jobs.forEach(function(j) {
@@ -33,7 +37,10 @@
                 if (j.status === 'running' || j.status === 'pending') {
                     actions = '<button class="btn btn-danger" onclick="SCRAPER.cancel(' + j.id + ')">Cancel</button>';
                 } else if (j.status === 'failed' || j.status === 'cancelled') {
-                    actions = '<button class="btn" onclick="SCRAPER.retry(' + j.id + ')">Retry</button>';
+                    actions = '<button class="btn" onclick="SCRAPER.retry(' + j.id + ')">Retry</button> ' +
+                              '<button class="btn btn-danger" onclick="SCRAPER.remove(' + j.id + ')">Delete</button>';
+                } else if (j.status === 'complete') {
+                    actions = '<button class="btn btn-danger" onclick="SCRAPER.remove(' + j.id + ')">Delete</button>';
                 }
 
                 // Truncate URL for display
@@ -146,8 +153,24 @@
         });
     }
 
+    function remove(jobId) {
+        if (!confirm('Delete job #' + jobId + '? This cannot be undone.')) return;
+        RECON.postJSON('/api/scraper/delete/' + jobId).then(function(data) {
+            if (data.ok) loadJobs();
+            else alert('Error: ' + (data.error || 'Unknown'));
+        });
+    }
+
+    function clearFailed() {
+        if (!confirm('Delete all failed and cancelled jobs?')) return;
+        RECON.postJSON('/api/scraper/clear-failed').then(function(data) {
+            if (data.ok) loadJobs();
+            else alert('Error: ' + (data.error || 'Unknown'));
+        });
+    }
+
     // Expose for inline onclick
-    window.SCRAPER = { submit: submit, cancel: cancel, retry: retry };
+    window.SCRAPER = { submit: submit, cancel: cancel, retry: retry, remove: remove, clearFailed: clearFailed };
 
     document.addEventListener('DOMContentLoaded', function() {
         RECON.startRefresh(loadJobs, 10000);
diff --git a/templates/kiwix/scraper.html b/templates/kiwix/scraper.html
index 53d3e23..3c42f43 100644
--- a/templates/kiwix/scraper.html
+++ b/templates/kiwix/scraper.html
@@ -65,7 +65,10 @@
 
     <!-- Jobs Table -->
     <div class="panel">
-        <h3 class="section-title" style="margin-bottom:12px;">Scrape Jobs</h3>
+        <div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:12px;">
+            <h3 class="section-title" style="margin:0;">Scrape Jobs</h3>
+            <button class="btn btn-danger" onclick="SCRAPER.clearFailed()" id="sc-clear-btn" style="display:none;">Clear Failed</button>
+        </div>
         <table class="data-table" id="sc-table">
             <thead>
                 <tr>

From f0b160ef7ca8fd0097e6cec91df27ad3701b953c Mon Sep 17 00:00:00 2001
From: Matt <matt@echo6.co>
Date: Sun, 19 Apr 2026 02:28:49 +0000
Subject: [PATCH 06/11] Extract _full_zim_cleanup helper, add SIGHUP +
 scrape_jobs cleanup

- Extract shared _full_zim_cleanup(source_id) from api_kiwix_remove
- Add SIGHUP to kiwix-serve after kiwix-manage remove
- Delete linked scrape_jobs rows during ZIM removal
- Update api_scraper_delete to do full ZIM cleanup when applicable
- Set chromium_path for single-file browser crawl support
- Add status.db to .gitignore

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .gitignore  |  1 +
 config.yaml |  2 +-
 lib/api.py  | 96 +++++++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 87 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3fb01ef..bce13d8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@ recon.db
 
 # Kiwix binary tools (installed from tarball)
 bin/
+status.db
diff --git a/config.yaml b/config.yaml
index bdabf69..082be93 100644
--- a/config.yaml
+++ b/config.yaml
@@ -487,7 +487,7 @@ scraper:
   # SingleFile CLI settings (browser crawl mode)
   singlefile:
     executable: single-file
-    chromium_path: ""              # Auto-detected from Playwright if empty
+    chromium_path: "/usr/bin/chromium-browser"
     crawl_max_depth: 10
 
 # Stream B: New Library Pipeline
diff --git a/lib/api.py b/lib/api.py
index ce0381f..b5cb8b5 100644
--- a/lib/api.py
+++ b/lib/api.py
@@ -2060,23 +2060,24 @@ def api_kiwix_upload():
 
 
 
-@app.route('/api/kiwix/remove/<int:source_id>', methods=['POST'])
-def api_kiwix_remove(source_id):
-    """Remove a ZIM source: delete vectors, DB records, library entry, and file."""
+def _full_zim_cleanup(source_id):
+    """Full ZIM cleanup: Qdrant vectors, DB records, kiwix-manage, SIGHUP, file delete.
+    Returns dict with results. Caller handles cache refresh."""
     import subprocess
+    import signal
     import requests as req
 
     db = StatusDB()
     conn = db._get_conn()
     row = conn.execute("SELECT * FROM zim_sources WHERE id = ?", (source_id,)).fetchone()
     if not row:
-        return jsonify({'error': 'Source not found'}), 404
+        return None
 
     zim_source = dict(row)
     zim_filename = zim_source['zim_filename']
     zim_path = zim_source['zim_path']
     zim_title = zim_source.get('title', zim_filename)
-    results = {'vectors_deleted': 0, 'docs_deleted': 0, 'file_deleted': False}
+    results = {'vectors_deleted': 0, 'docs_deleted': 0, 'file_deleted': False, 'scrape_jobs_deleted': 0}
 
     # Step 1: Find all document hashes for this ZIM source
     doc_hashes = [r['hash'] for r in conn.execute(
@@ -2135,7 +2136,6 @@ def api_kiwix_remove(source_id):
 
     # Step 4: Remove from kiwix-serve library
     try:
-        # Get the book ID from library.xml
         subprocess.run(
             ['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'remove', zim_filename.replace('.zim', '')],
             capture_output=True, text=True, timeout=10
@@ -2143,6 +2143,16 @@ def api_kiwix_remove(source_id):
     except Exception as e:
         logger.warning(f"kiwix-manage remove failed: {e}")
 
+    # Step 4b: SIGHUP kiwix-serve to reload library
+    try:
+        result = subprocess.run(['pidof', 'kiwix-serve'], capture_output=True, text=True, timeout=5)
+        if result.returncode == 0 and result.stdout.strip():
+            pid = int(result.stdout.strip().split()[0])
+            os.kill(pid, signal.SIGHUP)
+            logger.info(f"Sent SIGHUP to kiwix-serve (pid {pid})")
+    except Exception as e:
+        logger.warning(f"Failed to signal kiwix-serve: {e}")
+
     # Step 5: Delete the ZIM file
     if os.path.isfile(zim_path):
         try:
@@ -2152,13 +2162,37 @@ def api_kiwix_remove(source_id):
             logger.warning(f"ZIM file delete failed: {e}")
             results['file_deleted'] = False
 
+    # Step 6: Delete any linked scrape_jobs rows
+    try:
+        res = conn.execute("DELETE FROM scrape_jobs WHERE zim_source_id = ?", (source_id,))
+        conn.commit()
+        results['scrape_jobs_deleted'] = res.rowcount
+    except Exception as e:
+        logger.warning(f"scrape_jobs cleanup failed: {e}")
+
+    logger.info(f"Full ZIM cleanup for source {source_id} ('{zim_title}'): {results}")
+    return results
+
+
+@app.route('/api/kiwix/remove/<int:source_id>', methods=['POST'])
+def api_kiwix_remove(source_id):
+    """Remove a ZIM source: delete vectors, DB records, library entry, and file."""
+    db = StatusDB()
+    conn = db._get_conn()
+    row = conn.execute("SELECT * FROM zim_sources WHERE id = ?", (source_id,)).fetchone()
+    if not row:
+        return jsonify({'error': 'Source not found'}), 404
+
+    results = _full_zim_cleanup(source_id)
+    if results is None:
+        return jsonify({'error': 'Source not found during cleanup'}), 404
+
     # Refresh cache
     try:
         _cache['kiwix_sources'] = _build_kiwix_sources()
     except Exception:
         pass
 
-    logger.info(f"Removed ZIM source '{zim_title}': {results}")
     return jsonify({'ok': True, 'results': results})
 
 
@@ -2375,20 +2409,60 @@ def api_scraper_retry(job_id):
 
 @app.route('/api/scraper/delete/<int:job_id>', methods=['POST'])
 def api_scraper_delete(job_id):
-    """Delete a scrape job (only if not currently running)."""
+    """Delete a scrape job and clean up any associated ZIM artifacts."""
+    import subprocess
+    import signal
+
     db = StatusDB()
     job = db.get_scrape_job(job_id)
     if not job:
         return jsonify({'error': 'Job not found'}), 404
 
     if job['status'] == 'running':
-        return jsonify({'error': 'Cannot delete a running job — cancel it first'}), 400
+        return jsonify({'error': 'Cannot delete a running job \u2014 cancel it first'}), 400
 
+    zim_cleanup_results = None
+
+    # If the job has a linked zim_source, do full cleanup
+    if job.get('zim_source_id'):
+        zim_cleanup_results = _full_zim_cleanup(job['zim_source_id'])
+        try:
+            _cache['kiwix_sources'] = _build_kiwix_sources()
+        except Exception:
+            pass
+    elif job.get('zim_filename'):
+        # No zim_source row, but there may be an orphan file + library entry
+        zim_path = os.path.join('/mnt/kiwix', job['zim_filename'])
+        if os.path.isfile(zim_path):
+            try:
+                os.remove(zim_path)
+                logger.info(f"Deleted orphan ZIM file: {zim_path}")
+            except Exception as e:
+                logger.warning(f"Failed to delete orphan ZIM file {zim_path}: {e}")
+            try:
+                subprocess.run(
+                    ['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'remove',
+                     job['zim_filename'].replace('.zim', '')],
+                    capture_output=True, text=True, timeout=10
+                )
+            except Exception as e:
+                logger.warning(f"kiwix-manage remove failed for orphan: {e}")
+            try:
+                result = subprocess.run(['pidof', 'kiwix-serve'], capture_output=True, text=True, timeout=5)
+                if result.returncode == 0 and result.stdout.strip():
+                    pid = int(result.stdout.strip().split()[0])
+                    os.kill(pid, signal.SIGHUP)
+                    logger.info(f"Sent SIGHUP to kiwix-serve (pid {pid})")
+            except Exception as e:
+                logger.warning(f"Failed to signal kiwix-serve: {e}")
+
+    # Delete the scrape_jobs row (may already be gone if _full_zim_cleanup deleted it)
     conn = db._get_conn()
     conn.execute("DELETE FROM scrape_jobs WHERE id = ?", (job_id,))
     conn.commit()
-    logger.info(f"Scraper job {job_id} deleted")
-    return jsonify({'ok': True})
+
+    logger.info(f"Scraper job {job_id} deleted (zim_cleanup={zim_cleanup_results})")
+    return jsonify({'ok': True, 'zim_cleanup': zim_cleanup_results})
 
 
 @app.route('/api/scraper/clear-failed', methods=['POST'])

From 8945c82e3f16f248d06314600135209659af1867 Mon Sep 17 00:00:00 2001
From: Matt <matt@echo6.co>
Date: Sun, 19 Apr 2026 14:06:23 +0000
Subject: [PATCH 07/11] Replace wget/SingleFile/Playwright backends with Zimit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Zimit Docker container handles all site types (static, SPA, JS redirects)
- Removed: _detect_crawl_mode, _crawl_wget, _crawl_singlefile, preflight logic
- Added: _crawl_zimit() with Docker lifecycle management
- Simplified pipeline: submit → Zimit crawl → kiwix-manage register → done
- No more zimwriterfs step — Zimit produces ZIM directly
- Dashboard UI simplified: removed crawl mode dropdown
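- Config simplified: removed reject patterns, preflight, singlefile sections
- API: submit no longer accepts crawl_mode or reject-pattern overrides
  (crawl_mode is always stored as 'zimit'); cancel force-removes the
  job's Docker container instead of signalling a subprocess PID
- Uploads: .zim uploads now stream directly into /mnt/kiwix and are
  renamed in place instead of being buffered in /tmp first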

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 config.yaml                  |  75 +---
 lib/api.py                   |  67 ++--
 lib/scraper_runner.py        | 647 +++++++++--------------------------
 static/js/scraper.js         |  15 +-
 templates/kiwix/scraper.html |  14 +-
 5 files changed, 212 insertions(+), 606 deletions(-)

diff --git a/config.yaml b/config.yaml
index 082be93..a2709b0 100644
--- a/config.yaml
+++ b/config.yaml
@@ -414,81 +414,12 @@ peertube:
   poll_interval: 1800                    # Seconds between PeerTube acquisition polls (30 min)
 
 scraper:
-  workspace: /opt/recon/data/scraper      # Working directory for wget mirrors + ZIM builds
+  workspace: /opt/recon/data/scraper      # Working directory (tmp dirs for Zimit output)
   output_dir: /mnt/kiwix                  # Finished .zim files land here (kiwix-serve library)
-  rate_limit_delay: 0.5                   # Seconds between wget requests (--wait)
-  wait_random: 1.0                        # Random jitter added to wait (--random-wait range)
   default_language: eng                   # ISO 639-3 language code for ZIM metadata
-  user_agent: "Mozilla/5.0 (compatible; RECON/1.0; +https://echo6.co)"
   poll_interval: 300                      # Seconds between checking for pending scrape jobs
-  keep_workspace_on_failure: true         # Retain workspace for debugging when a job fails
-
-  # Default URL patterns rejected by wget --reject-regex.
-  # Covers common CMS junk across WordPress, Squarespace, Wix, Ghost, Drupal, etc.
-  # Per-job overrides: additional_reject_patterns (appended) or skip_default_patterns (bypass).
-  default_reject_patterns:
-    # WordPress
-    - '\?share='
-    - '\?replytocom='
-    - '\?like_comment='
-    - '/feed/'
-    - '/wp-json/'
-    - '/wp-login'
-    - '/wp-admin'
-    - '/wp-cron'
-    - '\?attachment_id='
-    - '/xmlrpc'
-    - '/trackback'
-    - '/comment-page-'
-    - '\?doing_wp_cron'
-    # Squarespace
-    - '\?format=json'
-    - '\?format=rss'
-    - '/api/'
-    # Wix
-    - '/_api/'
-    - '/_partials/'
-    # Ghost
-    - '/ghost/'
-    - '/p/'
-    # Drupal
-    - '\?q=comment'
-    - '\?q=node'
-    - '/user/login'
-    - '/user/register'
-    # General CMS / site chrome
-    - '/login'
-    - '/signup'
-    - '/register'
-    - '/cart'
-    - '/checkout'
-    - '/search\?'
-    - '/tag/'
-    - '/author/'
-    - '\?print='
-    - '\?pdf='
-    - '\?format=amp'
-    - '\?preview='
-    - '/rss'
-    - '/atom'
-    - '/cdn-cgi/'
-
-  # Pre-flight mode detection
-  preflight:
-    enabled: true
-    timeout: 30                    # Seconds for single-page Playwright fetch
-    min_static_size: 5120          # 5KB - wget HTML below this = suspect JS site
-    min_browser_size: 20480        # 20KB - browser HTML above this confirms JS
-    spa_markers:
-      - 'div#root'
-      - 'div#app'
-      - 'div#__next'
-
-  # SingleFile CLI settings (browser crawl mode)
-  singlefile:
-    executable: single-file
-    chromium_path: "/usr/bin/chromium-browser"
-    crawl_max_depth: 10
+  docker_image: ghcr.io/openzim/zimit     # Zimit Docker image for web crawling
+  docker_workers: 2                       # Concurrent crawl workers inside Zimit container
 
 # Stream B: New Library Pipeline
 new_pipeline:
diff --git a/lib/api.py b/lib/api.py
index b5cb8b5..6a3d627 100644
--- a/lib/api.py
+++ b/lib/api.py
@@ -44,6 +44,20 @@ app = Flask(__name__,
 
 app.config['MAX_CONTENT_LENGTH'] = None  # ZIM files can be multi-GB
 
+
+# ── Large ZIM upload support ──
+# Override stream factory so ZIM uploads write directly to /mnt/kiwix/
+# instead of /tmp (which is on the 96GB root disk and can't hold 100GB+ ZIMs).
+from flask import Request as _FlaskRequest
+
+class _LargeZimRequest(_FlaskRequest):
+    def _get_file_stream(self, total_content_length, content_type, filename=None, content_length=None):
+        if filename and filename.lower().endswith('.zim'):
+            return tempfile.NamedTemporaryFile('wb+', dir='/mnt/kiwix', prefix='.upload_', suffix='.tmp', delete=False)
+        return super()._get_file_stream(total_content_length, content_type, filename, content_length)
+
+app.request_class = _LargeZimRequest
+
 # ── Navigation Constants ──
 
 KNOWLEDGE_SUBNAV = [
@@ -2020,14 +2034,23 @@ def api_kiwix_upload():
 
     filename = secure_filename(f.filename)
     dest = os.path.join('/mnt/kiwix', filename)
-    tmp_dest = dest + '.tmp'
 
     try:
-        f.save(tmp_dest)
-        os.rename(tmp_dest, dest)
+        # Stream was written directly to /mnt/kiwix/ by _LargeZimRequest —
+        # rename in-place instead of copying 100GB+ through f.save()
+        if hasattr(f.stream, 'name') and f.stream.name:
+            tmp_path = f.stream.name
+            f.stream.close()
+            os.rename(tmp_path, dest)
+        else:
+            tmp_dest = dest + '.tmp'
+            f.save(tmp_dest)
+            os.rename(tmp_dest, dest)
     except Exception as e:
-        if os.path.exists(tmp_dest):
-            os.remove(tmp_dest)
+        # Clean up any temp files on failure
+        for p in [locals().get('tmp_path', ''), locals().get('tmp_dest', '')]:
+            if p and os.path.exists(p):
+                os.remove(p)
         return jsonify({'error': f'Save failed: {e}'}), 500
 
     # Register with kiwix-serve library
@@ -2320,24 +2343,11 @@ def api_scraper_submit():
     title = data.get('title', '').strip() or None
     category = data.get('category', '').strip() or None
 
-    # Optional per-job reject pattern overrides
-    additional_reject_patterns = data.get('additional_reject_patterns')
-    skip_default_patterns = bool(data.get('skip_default_patterns', False))
-
-    # Optional crawl mode override (static, browser, redirect, or null for auto-detect)
-    crawl_mode = data.get('crawl_mode')
-    if crawl_mode and crawl_mode not in ('static', 'browser', 'redirect'):
-        return jsonify({'error': "crawl_mode must be 'static', 'browser', 'redirect', or null"}), 400
-
-    # Serialize additional patterns as JSON if provided
-    import json as _json
-    additional_json = _json.dumps(additional_reject_patterns) if additional_reject_patterns else None
-
     db = StatusDB()
     conn = db._get_conn()
     conn.execute(
-        "INSERT INTO scrape_jobs (url, title, language, category, additional_reject_patterns, skip_default_patterns, crawl_mode) VALUES (?, ?, ?, ?, ?, ?, ?)",
-        (url, title, language, category, additional_json, int(skip_default_patterns), crawl_mode)
+        "INSERT INTO scrape_jobs (url, title, language, category, crawl_mode) VALUES (?, ?, ?, ?, ?)",
+        (url, title, language, category, 'zimit')
     )
     conn.commit()
     job_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
@@ -2358,8 +2368,6 @@ def api_scraper_jobs():
 @app.route('/api/scraper/cancel/<int:job_id>', methods=['POST'])
 def api_scraper_cancel(job_id):
     """Cancel a scrape job."""
-    import os as _os
-    import signal as _signal
 
     db = StatusDB()
     job = db.get_scrape_job(job_id)
@@ -2372,13 +2380,14 @@ def api_scraper_cancel(job_id):
     # Set cancelled in DB — the runner loop checks this between phases
     db.update_scrape_job(job_id, status='cancelled')
 
-    # If there's an active subprocess, send SIGTERM
-    pid = job.get('subprocess_pid')
-    if pid:
-        try:
-            _os.kill(pid, _signal.SIGTERM)
-        except (ProcessLookupError, PermissionError):
-            pass  # Process already gone
+    # Stop the Docker container if running
+    container_name = f'recon-scraper-{job_id}'
+    try:
+        import subprocess as _subprocess
+        _subprocess.run(['docker', 'rm', '-f', container_name],
+                        capture_output=True, timeout=10)
+    except Exception:
+        pass
 
     logger.info(f"Scraper job {job_id} cancelled")
     return jsonify({'ok': True})
diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py
index 280b874..f1e2efd 100644
--- a/lib/scraper_runner.py
+++ b/lib/scraper_runner.py
@@ -1,27 +1,21 @@
 """
 RECON Scraper Runner
 
-Daemon loop that processes scrape jobs: crawl → zimwriterfs → kiwix-manage.
-Supports two crawl backends:
-  - wget (static sites) — default
-  - SingleFile CLI (JS-rendered sites) — browser mode
-
-Pre-flight detection automatically chooses the right backend unless
-crawl_mode is pre-set on the job.
+Daemon loop that processes scrape jobs: crawl via Zimit → kiwix-manage.
+Zimit (openZIM Docker crawler) handles all site types and produces ZIM
+files directly — no separate zimwriterfs step needed.
 
 Public entry point: scraper_loop(stop_event, config).
 
-Config section: scraper (workspace, output_dir, rate_limit_delay, preflight, singlefile)
-DB table: scrape_jobs (status flow: pending → scraping → packaging → complete)
+Config section: scraper (output_dir, docker_image, docker_workers, poll_interval)
+DB table: scrape_jobs (status flow: pending → scraping → registering → complete)
 """
 import glob as _glob
-import json as _json
 import os
 import re
 import shutil
 import signal
 import subprocess
-import tempfile
 import time
 from datetime import datetime, timezone
 from urllib.parse import urlparse
@@ -39,6 +33,9 @@ def scraper_loop(stop_event, config):
 
     logger.info("Scraper runner started")
 
+    # Clean up any orphan Zimit containers from a previous crash
+    _cleanup_orphan_containers()
+
     while not stop_event.is_set():
         db = StatusDB()
         job = db.get_pending_scrape_job()
@@ -97,314 +94,115 @@ def _kill_process(proc, timeout=5):
         proc.wait(timeout=2)
 
 
-def _count_html_files(directory):
-    """Count HTML files in a directory tree."""
-    count = 0
-    for root, dirs, files in os.walk(directory):
-        for f in files:
-            if f.lower().endswith(('.html', '.htm')):
-                count += 1
-    return count
-
-
-def _find_welcome_page(content_dir, domain):
-    """Find the welcome page (index.html) in the wget mirror."""
-    domain_dir = None
-    for entry in os.listdir(content_dir):
-        entry_path = os.path.join(content_dir, entry)
-        if os.path.isdir(entry_path):
-            domain_dir = entry_path
-            break
-
-    if not domain_dir:
-        return None, content_dir
-
-    for candidate in ['index.html', 'index.htm']:
-        path = os.path.join(domain_dir, candidate)
-        if os.path.isfile(path):
-            return candidate, domain_dir
-
-    for root, dirs, files in os.walk(domain_dir):
-        for f in sorted(files):
-            if f.lower().endswith(('.html', '.htm')):
-                rel = os.path.relpath(os.path.join(root, f), domain_dir)
-                return rel, domain_dir
-
-    return 'index.html', domain_dir
-
-
-def _create_placeholder_illustration(path):
-    """Create a 48x48 placeholder PNG for zimwriterfs --illustration."""
-    from PIL import Image
-    img = Image.new('RGB', (48, 48), color=(40, 192, 232))
-    img.save(path, 'PNG')
-
-
-# ── Crawl mode detection ──────────────────────────────────────────
-
-
-def _get_chromium_path(config):
-    """Auto-detect Chromium from Playwright's cache, or use config override."""
-    configured = config.get('scraper', {}).get('singlefile', {}).get('chromium_path', '')
-    if configured and os.path.isfile(configured):
-        return configured
-    # Playwright stores Chromium — check both root and user caches
-    search_paths = [
-        os.path.expanduser('~/.cache/ms-playwright/chromium-*/chrome-linux*/chrome'),
-        '/root/.cache/ms-playwright/chromium-*/chrome-linux*/chrome',
-    ]
-    for pattern in search_paths:
-        matches = sorted(_glob.glob(pattern))
-        if matches:
-            return matches[-1]
-    return None
-
-
-def _detect_crawl_mode(url, config):
-    """
-    Pre-flight detection: determine whether a URL needs a browser to crawl.
-
-    Returns (mode, resolved_url) where mode is 'static', 'browser', or 'redirect'.
-    'redirect' means the URL redirected to a different domain (parking page etc.);
-    resolved_url will be the final browser URL in that case.
-    """
-    preflight_cfg = config.get('scraper', {}).get('preflight', {})
-    if not preflight_cfg.get('enabled', True):
-        return 'static', url
-
-    timeout = preflight_cfg.get('timeout', 30)
-    min_static = preflight_cfg.get('min_static_size', 5120)
-    min_browser = preflight_cfg.get('min_browser_size', 20480)
-    spa_markers = preflight_cfg.get('spa_markers', ['div#root', 'div#app', 'div#__next'])
-
-    input_domain = urlparse(url).hostname or ''
-    if input_domain.startswith('www.'):
-        input_domain = input_domain[4:]
-
-    # Step 1: wget single-page fetch
-    wget_html = ''
-    wget_size = 0
+def _cleanup_orphan_containers():
+    """Remove any leftover recon-scraper-* Docker containers from a previous crash."""
     try:
-        with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as tmp:
-            tmp_path = tmp.name
         result = subprocess.run(
-            ['wget', '-q', '-O', tmp_path, '--timeout=30', '--tries=1', url],
-            capture_output=True, text=True, timeout=timeout + 5
+            ['docker', 'ps', '-a', '--filter', 'name=recon-scraper-', '--format', '{{.Names}}'],
+            capture_output=True, text=True, timeout=10
         )
-        if os.path.isfile(tmp_path):
-            wget_size = os.path.getsize(tmp_path)
-            with open(tmp_path, 'r', errors='replace') as f:
-                wget_html = f.read()
-        os.unlink(tmp_path)
+        if result.returncode == 0 and result.stdout.strip():
+            for name in result.stdout.strip().split('\n'):
+                name = name.strip()
+                if name:
+                    subprocess.run(['docker', 'rm', '-f', name], capture_output=True, timeout=10)
+                    logger.info(f"Cleaned up orphan container: {name}")
     except Exception as e:
-        logger.debug(f"Preflight wget failed for {url}: {e}")
-        try:
-            os.unlink(tmp_path)
-        except Exception:
-            pass
-
-    # Step 2: Playwright headless fetch
-    browser_html = ''
-    browser_size = 0
-    browser_url = url
-    try:
-        from playwright.sync_api import sync_playwright
-        with sync_playwright() as p:
-            browser = p.chromium.launch(
-                headless=True,
-                args=['--no-sandbox', '--disable-dev-shm-usage']
-            )
-            page = browser.new_page()
-            page.goto(url, wait_until='networkidle', timeout=timeout * 1000)
-            browser_url = page.url
-            browser_html = page.content()
-            browser_size = len(browser_html.encode('utf-8'))
-            browser.close()
-    except Exception as e:
-        logger.debug(f"Preflight Playwright failed for {url}: {e}")
-        # If Playwright fails entirely, fall back to static
-        return 'static', url
-
-    # Step 3: Decision logic
-    browser_domain = urlparse(browser_url).hostname or ''
-    if browser_domain.startswith('www.'):
-        browser_domain = browser_domain[4:]
-
-    # Check for cross-domain redirect (parking page detection)
-    if browser_domain and input_domain and browser_domain != input_domain:
-        logger.info(f"Preflight: {url} redirected to different domain {browser_domain}, mode=redirect")
-        return 'redirect', browser_url
-
-    # Check size disparity: small wget + large browser = JS-rendered
-    if wget_size < min_static and browser_size > min_browser:
-        logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=browser")
-        return 'browser', url
-
-    # Check for SPA shell markers in wget HTML
-    if wget_html:
-        try:
-            from bs4 import BeautifulSoup
-            soup = BeautifulSoup(wget_html, 'html.parser')
-            for marker in spa_markers:
-                # marker is like 'div#root' — split tag and id
-                parts = marker.split('#', 1)
-                tag = parts[0] if parts[0] else 'div'
-                elem_id = parts[1] if len(parts) > 1 else None
-                elem = soup.find(tag, id=elem_id) if elem_id else soup.find(tag)
-                if elem:
-                    text_content = elem.get_text(strip=True)
-                    if len(text_content) < 100:
-                        logger.info(f"Preflight: {url} has SPA marker {marker} with {len(text_content)} chars text, mode=browser")
-                        return 'browser', url
-        except Exception as e:
-            logger.debug(f"Preflight SPA marker check failed: {e}")
-
-    logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=static")
-    return 'static', url
+        logger.warning(f"Orphan container cleanup failed: {e}")
 
 
-# ── Crawl backends ────────────────────────────────────────────────
+# ── Zimit crawl backend ──────────────────────────────────────────
 
 
-def _crawl_wget(job, url, site_dir, config, stop_event, db):
+def _crawl_zimit(job, config, stop_event, db):
     """
-    wget mirror crawl backend.
-    Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel.
+    Crawl a URL using Zimit (openZIM Docker crawler).
+
+    Returns (page_count, zim_filename, error_msg).
+    On success: (count, filename, None)
+    On failure: (0, None, error_string)
     """
     job_id = job['id']
+    url = job['url']
+    title = job.get('title') or _sanitize_domain(url)
+    language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng')
+    category = job.get('category') or ''
+
     scraper_cfg = config.get('scraper', {})
-    rate_limit_delay = scraper_cfg.get('rate_limit_delay', 0.5)
-    user_agent = scraper_cfg.get('user_agent', 'Mozilla/5.0 (compatible; RECON/1.0)')
-    keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True)
-    workspace = os.path.dirname(site_dir)
+    output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix')
+    docker_image = scraper_cfg.get('docker_image', 'ghcr.io/openzim/zimit')
+    docker_workers = scraper_cfg.get('docker_workers', 2)
 
-    # Build reject-regex from config defaults + per-job overrides
-    reject_patterns = []
-    skip_defaults = bool(job.get('skip_default_patterns'))
-    if not skip_defaults:
-        reject_patterns.extend(scraper_cfg.get('default_reject_patterns', []))
-    additional_raw = job.get('additional_reject_patterns')
-    if additional_raw:
-        try:
-            additional = _json.loads(additional_raw) if isinstance(additional_raw, str) else additional_raw
-            if isinstance(additional, list):
-                reject_patterns.extend(additional)
-        except (ValueError, TypeError):
-            pass
-
-    wget_cmd = [
-        'wget', '--mirror', '--convert-links', '--adjust-extension',
-        '--page-requisites', '--no-parent',
-        '--restrict-file-names=windows',
-        f'--wait={rate_limit_delay}', '--random-wait',
-        f'--user-agent={user_agent}',
-        f'--directory-prefix={site_dir}',
-        '--timeout=30', '--tries=3',
-    ]
-    if reject_patterns:
-        combined_regex = '|'.join(f'({p})' for p in reject_patterns)
-        wget_cmd.extend([f'--reject-regex={combined_regex}'])
-        logger.info(f"Job {job_id}: reject-regex has {len(reject_patterns)} patterns")
-    wget_cmd.append(url)
-
-    logger.info(f"Job {job_id}: wget mirror starting")
-    wget_log = os.path.join(workspace, 'wget.log')
-    try:
-        with open(wget_log, 'w') as log_fh:
-            proc = subprocess.Popen(
-                wget_cmd,
-                stdout=log_fh, stderr=subprocess.STDOUT,
-            )
-        db.update_scrape_job(job_id, subprocess_pid=proc.pid)
-
-        while proc.poll() is None:
-            if stop_event.is_set() or _check_cancelled(db, job_id):
-                _kill_process(proc)
-                return 0, 'cancelled'
-            try:
-                proc.wait(timeout=5)
-            except subprocess.TimeoutExpired:
-                pass
-
-        db.update_scrape_job(job_id, subprocess_pid=None)
-
-        if stop_event.is_set() or _check_cancelled(db, job_id):
-            return 0, 'cancelled'
-
-        # wget returns 8 for some server errors but may still have useful content
-        if proc.returncode not in (0, 4, 6, 8):
-            output = ''
-            try:
-                with open(wget_log, 'r') as f:
-                    f.seek(max(0, os.path.getsize(wget_log) - 500))
-                    output = f.read()
-            except Exception:
-                pass
-            return 0, f"wget failed with code {proc.returncode}: {output[-500:]}"
-
-    except Exception as e:
-        return 0, f"wget error: {e}"
-
-    page_count = _count_html_files(site_dir)
-    logger.info(f"Job {job_id}: wget complete, {page_count} HTML pages found")
-
-    if page_count == 0:
-        return 0, 'wget produced no HTML files'
-
-    return page_count, None
-
-
-def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
-    """
-    SingleFile CLI crawl backend for JS-rendered sites.
-    Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel.
-    """
-    job_id = job['id']
-    scraper_cfg = config.get('scraper', {})
-    sf_cfg = scraper_cfg.get('singlefile', {})
-    keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True)
-    workspace = os.path.dirname(site_dir)
-
-    executable = sf_cfg.get('executable', 'single-file')
-    chromium_path = _get_chromium_path(config)
-    crawl_max_depth = sf_cfg.get('crawl_max_depth', 10)
-
-    if not chromium_path:
-        return 0, 'Chromium not found — cannot use browser crawl mode'
-
-    # SingleFile outputs into site_dir/<domain>/ to match wget's structure
     domain = _sanitize_domain(url)
-    output_dir = os.path.join(site_dir, domain)
-    os.makedirs(output_dir, exist_ok=True)
+    date_tag = datetime.now().strftime('%Y-%m')
+    container_name = f'recon-scraper-{job_id}'
+    tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}')
 
-    sf_cmd = [
-        executable,
-        '--crawl-links=true',
-        '--crawl-inner-links-only=true',
-        '--crawl-no-parent=true',
-        '--crawl-replace-URLs=true',
-        f'--crawl-max-depth={crawl_max_depth}',
-        f'--browser-executable-path={chromium_path}',
-        '--browser-headless=true',
-        '--browser-args=["--no-sandbox","--disable-dev-shm-usage"]',
-        f'--output-directory={output_dir}',
-        url,
+    # Clean up any pre-existing container with same name (retry scenario)
+    subprocess.run(['docker', 'rm', '-f', container_name], capture_output=True, timeout=10)
+
+    os.makedirs(tmp_dir, exist_ok=True)
+
+    description = f"Mirror of {domain}"
+    if category:
+        description = f"{category} — mirror of {domain}"
+
+    docker_cmd = [
+        'docker', 'run', '--rm',
+        '--name', container_name,
+        '-v', f'{tmp_dir}:/output',
+        docker_image,
+        '--url', url,
+        '--name', _sanitize_filename(domain),
+        '--lang', language,
+        '--title', title,
+        '--description', description[:80],
+        '--output', '/output',
+        '--workers', str(docker_workers),
     ]
 
-    logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth})")
-    sf_log = os.path.join(workspace, 'singlefile.log')
+    logger.info(f"Job {job_id}: Zimit crawl starting — {url}")
     try:
-        with open(sf_log, 'w') as log_fh:
-            proc = subprocess.Popen(
-                sf_cmd,
-                stdout=log_fh, stderr=subprocess.STDOUT,
-            )
+        proc = subprocess.Popen(
+            docker_cmd,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
         db.update_scrape_job(job_id, subprocess_pid=proc.pid)
 
+        last_progress_check = 0
         while proc.poll() is None:
             if stop_event.is_set() or _check_cancelled(db, job_id):
+                # Stop the Docker container
+                subprocess.run(['docker', 'rm', '-f', container_name],
+                               capture_output=True, timeout=10)
                 _kill_process(proc)
-                return 0, 'cancelled'
+                shutil.rmtree(tmp_dir, ignore_errors=True)
+                return 0, None, 'cancelled'
+
+            # Check progress every 30s via docker logs
+            now = time.time()
+            if now - last_progress_check >= 30:
+                last_progress_check = now
+                try:
+                    log_result = subprocess.run(
+                        ['docker', 'logs', '--tail', '20', container_name],
+                        capture_output=True, text=True, timeout=10
+                    )
+                    if log_result.returncode == 0 and log_result.stderr:
+                        # Zimit/Browsertrix logs page counts — look for numbers
+                        lines = log_result.stderr.strip().split('\n')
+                        for line in reversed(lines):
+                            # Look for patterns like "X pages" or page count indicators
+                            match = re.search(r'(\d+)\s+page', line, re.IGNORECASE)
+                            if match:
+                                count = int(match.group(1))
+                                if count > 0:
+                                    db.update_scrape_job(job_id, page_count=count)
+                                break
+                except Exception:
+                    pass
+
             try:
                 proc.wait(timeout=5)
             except subprocess.TimeoutExpired:
@@ -413,42 +211,59 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
         db.update_scrape_job(job_id, subprocess_pid=None)
 
         if stop_event.is_set() or _check_cancelled(db, job_id):
-            return 0, 'cancelled'
+            shutil.rmtree(tmp_dir, ignore_errors=True)
+            return 0, None, 'cancelled'
 
         if proc.returncode != 0:
-            output = ''
+            # Capture last 50 lines of docker logs for error context
+            error_msg = f"Zimit exited with code {proc.returncode}"
             try:
-                with open(sf_log, 'r') as f:
-                    f.seek(max(0, os.path.getsize(sf_log) - 500))
-                    output = f.read()
+                log_result = subprocess.run(
+                    ['docker', 'logs', '--tail', '50', container_name],
+                    capture_output=True, text=True, timeout=10
+                )
+                log_text = (log_result.stderr or log_result.stdout or '').strip()
+                if log_text:
+                    # Take last 500 chars
+                    error_msg += f": {log_text[-500:]}"
             except Exception:
                 pass
-            # SingleFile may still produce some files even with non-zero exit
-            page_count = _count_html_files(site_dir)
-            if page_count == 0:
-                return 0, f"SingleFile failed with code {proc.returncode}: {output[-500:]}"
-            logger.warning(f"Job {job_id}: SingleFile exited {proc.returncode} but produced {page_count} pages, continuing")
+            shutil.rmtree(tmp_dir, ignore_errors=True)
+            return 0, None, error_msg
 
     except Exception as e:
-        return 0, f"SingleFile error: {e}"
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+        return 0, None, f"Zimit error: {e}"
 
-    # If no index.html exists, rename the first HTML file to index.html
-    index_path = os.path.join(output_dir, 'index.html')
-    if not os.path.isfile(index_path):
-        for f in sorted(os.listdir(output_dir)):
-            if f.lower().endswith(('.html', '.htm')):
-                src = os.path.join(output_dir, f)
-                os.rename(src, index_path)
-                logger.info(f"Job {job_id}: renamed {f} → index.html")
-                break
+    # Find the output ZIM file
+    zim_files = _glob.glob(os.path.join(tmp_dir, '*.zim'))
+    if not zim_files:
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+        return 0, None, 'Zimit produced no ZIM file'
 
-    page_count = _count_html_files(site_dir)
-    logger.info(f"Job {job_id}: SingleFile complete, {page_count} HTML pages found")
+    src_zim = zim_files[0]  # Should be exactly one
 
-    if page_count == 0:
-        return 0, 'SingleFile produced no HTML files'
+    # Reuse the page count recorded in the DB during progress polling (0 if none)
+    page_count = 0
+    try:
+        job_state = db.get_scrape_job(job_id)
+        page_count = job_state.get('page_count') or 0
+    except Exception:
+        pass
 
-    return page_count, None
+    # Rename to final location
+    zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
+    zim_path = os.path.join(output_dir, zim_filename)
+    try:
+        shutil.move(src_zim, zim_path)
+    except Exception as e:
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+        return 0, None, f"Failed to move ZIM to output dir: {e}"
+
+    shutil.rmtree(tmp_dir, ignore_errors=True)
+    logger.info(f"Job {job_id}: Zimit complete — {zim_filename}")
+
+    return page_count, zim_filename, None
 
 
 # ── Main job pipeline ─────────────────────────────────────────────
@@ -458,183 +273,43 @@ def _process_job(job, config, stop_event):
     """Execute the full scrape pipeline for a single job."""
     db = StatusDB()
     job_id = job['id']
-    url = job['url']
-    title = job.get('title') or _sanitize_domain(url)
-    language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng')
-    category = job.get('category') or ''
 
-    scraper_cfg = config.get('scraper', {})
-    workspace_root = scraper_cfg.get('workspace', '/opt/recon/data/scraper')
-    output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix')
-    keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True)
+    logger.info(f"Job {job_id}: starting scrape of {job['url']}")
 
-    workspace = os.path.join(workspace_root, str(job_id))
-    site_dir = os.path.join(workspace, 'site')
-    os.makedirs(site_dir, exist_ok=True)
-
-    domain = _sanitize_domain(url)
-    date_tag = datetime.now().strftime('%Y-%m')
-    zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
-    zim_path = os.path.join(output_dir, zim_filename)
-
-    logger.info(f"Job {job_id}: starting scrape of {url}")
+    # ── Phase 1: Crawl via Zimit ───────────────────────────────────
     db.update_scrape_job(job_id,
                          status='scraping',
-                         workspace_path=workspace,
+                         crawl_mode='zimit',
                          started_at=_now())
 
-    # ── Phase 0: Pre-flight mode detection ─────────────────────────
     if stop_event.is_set() or _check_cancelled(db, job_id):
-        _handle_cancel(db, job_id, workspace, keep_workspace)
+        _handle_cancel(db, job_id)
         return
 
-    pre_set = job.get('crawl_mode')
-    if pre_set:
-        crawl_mode, resolved_url = pre_set, url
-        logger.info(f"Job {job_id}: using pre-set crawl_mode={crawl_mode}")
-    else:
-        crawl_mode, resolved_url = _detect_crawl_mode(url, config)
-        logger.info(f"Job {job_id}: detected crawl_mode={crawl_mode}")
-
-    db.update_scrape_job(job_id, crawl_mode=crawl_mode)
-
-    # If redirect detected, update domain/filename to match resolved URL
-    if crawl_mode == 'redirect' and resolved_url != url:
-        logger.info(f"Job {job_id}: URL resolved from {url} → {resolved_url}")
-        domain = _sanitize_domain(resolved_url)
-        zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
-        zim_path = os.path.join(output_dir, zim_filename)
-
-    # ── Phase A: Crawl (dispatch to backend) ────────────────────────
-    if stop_event.is_set() or _check_cancelled(db, job_id):
-        _handle_cancel(db, job_id, workspace, keep_workspace)
-        return
-
-    if crawl_mode == 'browser':
-        page_count, error = _crawl_singlefile(job, resolved_url, site_dir, config, stop_event, db)
-    else:  # 'static' or 'redirect'
-        page_count, error = _crawl_wget(job, resolved_url, site_dir, config, stop_event, db)
+    page_count, zim_filename, error = _crawl_zimit(job, config, stop_event, db)
 
     if error == 'cancelled':
-        _handle_cancel(db, job_id, workspace, keep_workspace)
+        _handle_cancel(db, job_id)
         return
     elif error:
         db.update_scrape_job(job_id,
                              status='failed',
-                             error_message=error,
+                             error_message=error[:1000],
                              subprocess_pid=None,
                              completed_at=_now())
-        if not keep_workspace:
-            shutil.rmtree(workspace, ignore_errors=True)
         return
 
     db.update_scrape_job(job_id, page_count=page_count)
 
-    # ── Phase B: Prepare zimwriterfs inputs ────────────────────────
+    # ── Phase 2: Register with kiwix-serve ─────────────────────────
     if stop_event.is_set() or _check_cancelled(db, job_id):
-        _handle_cancel(db, job_id, workspace, keep_workspace)
+        _handle_cancel(db, job_id)
         return
 
-    welcome_page, content_dir = _find_welcome_page(site_dir, domain)
-    if welcome_page is None:
-        welcome_page = 'index.html'
-
-    illustration_path = os.path.join(workspace, 'illustration.png')
-    _create_placeholder_illustration(illustration_path)
-    illust_dest = os.path.join(content_dir, 'illustration.png')
-    shutil.copy2(illustration_path, illust_dest)
-
-    description = f"Mirror of {domain}"
-    if category:
-        description = f"{category} — mirror of {domain}"
-
-    logger.info(f"Job {job_id}: packaging ZIM (welcome={welcome_page}, content_dir={content_dir})")
-    db.update_scrape_job(job_id, status='packaging')
-
-    # ── Phase C: zimwriterfs ───────────────────────────────────────
-    if stop_event.is_set() or _check_cancelled(db, job_id):
-        _handle_cancel(db, job_id, workspace, keep_workspace)
-        return
-
-    zim_name = _sanitize_filename(domain)
-    long_description = f"Offline mirror of {resolved_url} created by RECON web scraper"
-
-    zim_cmd = [
-        'zimwriterfs',
-        f'--welcome={welcome_page}',
-        f'--illustration=illustration.png',
-        f'--language={language}',
-        f'--title={title}',
-        f'--description={description[:80]}',
-        f'--longDescription={long_description[:4096]}',
-        f'--name={zim_name}',
-        f'--creator={domain}',
-        '--publisher=RECON',
-        content_dir,
-        zim_path,
-    ]
-
-    zim_log = os.path.join(workspace, 'zimwriterfs.log')
-    try:
-        with open(zim_log, 'w') as log_fh:
-            proc = subprocess.Popen(
-                zim_cmd,
-                stdout=log_fh, stderr=subprocess.STDOUT,
-            )
-        db.update_scrape_job(job_id, subprocess_pid=proc.pid)
-
-        while proc.poll() is None:
-            if stop_event.is_set() or _check_cancelled(db, job_id):
-                _kill_process(proc)
-                _handle_cancel(db, job_id, workspace, keep_workspace)
-                return
-            try:
-                proc.wait(timeout=5)
-            except subprocess.TimeoutExpired:
-                pass
-
-        db.update_scrape_job(job_id, subprocess_pid=None)
-
-        if stop_event.is_set() or _check_cancelled(db, job_id):
-            _handle_cancel(db, job_id, workspace, keep_workspace)
-            return
-
-        if proc.returncode != 0:
-            output = ''
-            try:
-                with open(zim_log, 'r') as f:
-                    f.seek(max(0, os.path.getsize(zim_log) - 500))
-                    output = f.read()
-            except Exception:
-                pass
-            raise RuntimeError(f"zimwriterfs failed with code {proc.returncode}: {output[-500:]}")
-
-    except RuntimeError:
-        raise
-    except Exception as e:
-        db.update_scrape_job(job_id,
-                             status='failed',
-                             error_message=f"zimwriterfs error: {e}",
-                             subprocess_pid=None,
-                             completed_at=_now())
-        if not keep_workspace:
-            shutil.rmtree(workspace, ignore_errors=True)
-        return
-
-    if not os.path.isfile(zim_path):
-        db.update_scrape_job(job_id,
-                             status='failed',
-                             error_message='zimwriterfs produced no output file',
-                             completed_at=_now())
-        return
-
-    logger.info(f"Job {job_id}: ZIM created at {zim_path}")
-
-    # ── Phase D: kiwix-manage + registration ───────────────────────
-    if stop_event.is_set() or _check_cancelled(db, job_id):
-        _handle_cancel(db, job_id, workspace, keep_workspace)
-        return
+    db.update_scrape_job(job_id, status='registering')
 
+    output_dir = config.get('scraper', {}).get('output_dir', '/mnt/kiwix')
+    zim_path = os.path.join(output_dir, zim_filename)
     kiwix_manage = shutil.which('kiwix-manage') or '/opt/recon/bin/kiwix-manage'
     library_xml = '/mnt/kiwix/library.xml'
 
@@ -670,26 +345,32 @@ def _process_job(job, config, stop_event):
     except Exception as e:
         logger.warning(f"Job {job_id}: scan_zims failed: {e}")
 
-    try:
-        shutil.rmtree(workspace, ignore_errors=True)
-    except Exception:
-        pass
-
+    # ── Phase 3: Complete ──────────────────────────────────────────
     db.update_scrape_job(job_id,
                          status='complete',
                          zim_filename=zim_filename,
                          zim_source_id=zim_source_id,
                          completed_at=_now())
 
-    logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages, mode={crawl_mode})")
+    logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages)")
 
 
-def _handle_cancel(db, job_id, workspace, keep_workspace):
-    """Handle job cancellation: clean up and update status."""
+def _handle_cancel(db, job_id):
+    """Handle job cancellation: clean up Docker container and update status."""
+    container_name = f'recon-scraper-{job_id}'
+    try:
+        subprocess.run(['docker', 'rm', '-f', container_name],
+                       capture_output=True, timeout=10)
+    except Exception:
+        pass
+
+    # Clean up tmp dir if it exists
+    output_dir = '/mnt/kiwix'
+    tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}')
+    shutil.rmtree(tmp_dir, ignore_errors=True)
+
     logger.info(f"Job {job_id}: cancelled")
     db.update_scrape_job(job_id,
                          status='cancelled',
                          subprocess_pid=None,
                          completed_at=_now())
-    if not keep_workspace:
-        shutil.rmtree(workspace, ignore_errors=True)
diff --git a/static/js/scraper.js b/static/js/scraper.js
index 49ce178..3988ffe 100644
--- a/static/js/scraper.js
+++ b/static/js/scraper.js
@@ -12,7 +12,7 @@
             jobs.forEach(function(j) {
                 if (j.status === 'complete') complete++;
                 else if (j.status === 'failed' || j.status === 'cancelled') failed++;
-                else if (j.status === 'running' || j.status === 'pending') active++;
+                else if (j.status === 'scraping' || j.status === 'registering' || j.status === 'pending') active++;
             });
             RECON.set('sc-total', RECON.fmt(total));
             RECON.set('sc-active', RECON.fmt(active));
@@ -27,14 +27,12 @@
             var html = '';
             jobs.forEach(function(j) {
                 var badge = statusBadge(j.status);
-                var mode = j.crawl_mode ?
-                    '<span class="text-small">' + j.crawl_mode + '</span>' : '<span class="text-muted">\u2014</span>';
                 var pages = j.page_count ? RECON.fmt(j.page_count) : '\u2014';
                 var zim = j.zim_filename ?
                     '<span class="text-small">' + j.zim_filename + '</span>' : '\u2014';
                 var actions = '';
 
-                if (j.status === 'running' || j.status === 'pending') {
+                if (j.status === 'scraping' || j.status === 'registering' || j.status === 'pending') {
                     actions = '<button class="btn btn-danger" onclick="SCRAPER.cancel(' + j.id + ')">Cancel</button>';
                 } else if (j.status === 'failed' || j.status === 'cancelled') {
                     actions = '<button class="btn" onclick="SCRAPER.retry(' + j.id + ')">Retry</button> ' +
@@ -50,14 +48,13 @@
                     '<td>' + j.id + '</td>' +
                     '<td><a href="' + escHtml(j.url) + '" target="_blank" title="' + escHtml(j.url) + '">' + escHtml(displayUrl) + '</a></td>' +
                     '<td>' + escHtml(j.title || '\u2014') + '</td>' +
-                    '<td>' + mode + '</td>' +
                     '<td>' + pages + '</td>' +
                     '<td>' + badge + errorTooltip(j) + '</td>' +
                     '<td>' + zim + '</td>' +
                     '<td>' + actions + '</td>' +
                     '</tr>';
             });
-            if (!html) html = '<tr><td colspan="8" class="text-muted">No scrape jobs</td></tr>';
+            if (!html) html = '<tr><td colspan="7" class="text-muted">No scrape jobs</td></tr>';
             RECON.setHTML('sc-table-body', html);
         }).catch(function(err) {
             console.error('Scraper dashboard error:', err);
@@ -67,7 +64,8 @@
     function statusBadge(status) {
         var map = {
             'pending': '<span class="badge-detected">PENDING</span>',
-            'running': '<span class="badge-processing">RUNNING</span>',
+            'scraping': '<span class="badge-processing">SCRAPING</span>',
+            'registering': '<span class="badge-processing">REGISTERING</span>',
             'complete': '<span class="badge-complete">COMPLETE</span>',
             'failed': '<span class="badge-failed">FAILED</span>',
             'cancelled': '<span class="badge-detected">CANCELLED</span>'
@@ -98,12 +96,9 @@
         var title = document.getElementById('sf-title').value.trim();
         var lang = document.getElementById('sf-lang').value;
         var category = document.getElementById('sf-category').value.trim();
-        var mode = document.getElementById('sf-mode').value;
-
         if (title) body.title = title;
         if (lang) body.language = lang;
         if (category) body.category = category;
-        if (mode) body.crawl_mode = mode;
 
         var btn = document.getElementById('sf-submit-btn');
         var feedback = document.getElementById('sf-feedback');
diff --git a/templates/kiwix/scraper.html b/templates/kiwix/scraper.html
index 3c42f43..862ba0a 100644
--- a/templates/kiwix/scraper.html
+++ b/templates/kiwix/scraper.html
@@ -17,7 +17,7 @@
                            style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
                 </div>
             </div>
-            <div style="display:grid;grid-template-columns:1fr 1fr 1fr auto;gap:12px;align-items:end;">
+            <div style="display:grid;grid-template-columns:1fr 1fr auto;gap:12px;align-items:end;">
                 <div>
                     <label class="text-small text-muted" style="display:block;margin-bottom:4px;">Language</label>
                     <select id="sf-lang"
@@ -38,15 +38,6 @@
                     <input type="text" id="sf-category" placeholder="Optional"
                            style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
                 </div>
-                <div>
-                    <label class="text-small text-muted" style="display:block;margin-bottom:4px;">Crawl Mode</label>
-                    <select id="sf-mode"
-                            style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
-                        <option value="" selected>Auto-detect</option>
-                        <option value="static">Static (wget)</option>
-                        <option value="browser">Browser (SingleFile)</option>
-                    </select>
-                </div>
                 <div>
                     <button type="submit" class="btn" id="sf-submit-btn">Submit</button>
                 </div>
@@ -75,7 +66,6 @@
                     <th>ID</th>
                     <th>URL</th>
                     <th>Title</th>
-                    <th>Mode</th>
                     <th>Pages</th>
                     <th>Status</th>
                     <th>ZIM</th>
@@ -83,7 +73,7 @@
                 </tr>
             </thead>
             <tbody id="sc-table-body">
-                <tr><td colspan="8" class="text-muted">Loading...</td></tr>
+                <tr><td colspan="7" class="text-muted">Loading...</td></tr>
             </tbody>
         </table>
     </div>

From 76076fc4ab87d6c7dec7bde564435b703b82cd7c Mon Sep 17 00:00:00 2001
From: Matt <matt@echo6.co>
Date: Sun, 19 Apr 2026 14:13:34 +0000
Subject: [PATCH 08/11] Fix Zimit CLI: add subcommand, correct flag names, fix
 container cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Must pass `zimit` as command after image name (entrypoint execs args)
- --url → --seeds, --name removed, --lang → --zim-lang, --workers → -w
- Remove --rm so docker logs work after exit, manually rm container

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 lib/scraper_runner.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py
index f1e2efd..9658be5 100644
--- a/lib/scraper_runner.py
+++ b/lib/scraper_runner.py
@@ -148,17 +148,17 @@ def _crawl_zimit(job, config, stop_event, db):
         description = f"{category} — mirror of {domain}"
 
     docker_cmd = [
-        'docker', 'run', '--rm',
+        'docker', 'run',
         '--name', container_name,
         '-v', f'{tmp_dir}:/output',
         docker_image,
-        '--url', url,
-        '--name', _sanitize_filename(domain),
-        '--lang', language,
+        'zimit',
+        '--seeds', url,
+        '--zim-lang', language,
         '--title', title,
         '--description', description[:80],
         '--output', '/output',
-        '--workers', str(docker_workers),
+        '-w', str(docker_workers),
     ]
 
     logger.info(f"Job {job_id}: Zimit crawl starting — {url}")
@@ -228,6 +228,9 @@ def _crawl_zimit(job, config, stop_event, db):
                     error_msg += f": {log_text[-500:]}"
             except Exception:
                 pass
+            # Remove container (no --rm flag, so we clean up manually)
+            subprocess.run(['docker', 'rm', '-f', container_name],
+                           capture_output=True, timeout=10)
             shutil.rmtree(tmp_dir, ignore_errors=True)
             return 0, None, error_msg
 
@@ -235,6 +238,10 @@ def _crawl_zimit(job, config, stop_event, db):
         shutil.rmtree(tmp_dir, ignore_errors=True)
         return 0, None, f"Zimit error: {e}"
 
+    # Remove container (no --rm flag, so we clean up manually after getting logs)
+    subprocess.run(['docker', 'rm', '-f', container_name],
+                   capture_output=True, timeout=10)
+
     # Find the output ZIM file
     zim_files = _glob.glob(os.path.join(tmp_dir, '*.zim'))
     if not zim_files:

From b035ba3f203b3167259415c6261af0b555859ff7 Mon Sep 17 00:00:00 2001
From: Matt <matt@echo6.co>
Date: Sun, 19 Apr 2026 14:30:42 +0000
Subject: [PATCH 09/11] Fix Zimit: add required --name flag for warc2zim

warc2zim (called internally by zimit) requires --name for ZIM metadata.
Without it, argument validation fails with exit code 2.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 lib/scraper_runner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py
index 9658be5..eb50695 100644
--- a/lib/scraper_runner.py
+++ b/lib/scraper_runner.py
@@ -154,6 +154,7 @@ def _crawl_zimit(job, config, stop_event, db):
         docker_image,
         'zimit',
         '--seeds', url,
+        '--name', _sanitize_filename(domain),
         '--zim-lang', language,
         '--title', title,
         '--description', description[:80],

From 96920447900037310efcc0a30cb45d1a1c7effe5 Mon Sep 17 00:00:00 2001
From: Matt <matt@echo6.co>
Date: Sun, 19 Apr 2026 19:33:50 +0000
Subject: [PATCH 10/11] Fix progress parsing for Browsertrix JSON log format

Parse "crawled":N from Browsertrix crawlStatus JSON logs instead of
looking for "N pages" pattern. Also check stdout (not just stderr).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 lib/scraper_runner.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py
index eb50695..d6b0299 100644
--- a/lib/scraper_runner.py
+++ b/lib/scraper_runner.py
@@ -190,12 +190,12 @@ def _crawl_zimit(job, config, stop_event, db):
                         ['docker', 'logs', '--tail', '20', container_name],
                         capture_output=True, text=True, timeout=10
                     )
-                    if log_result.returncode == 0 and log_result.stderr:
-                        # Zimit/Browsertrix logs page counts — look for numbers
-                        lines = log_result.stderr.strip().split('\n')
+                    if log_result.returncode == 0:
+                        # Browsertrix logs JSON with "crawled":N — read stdout, fall back to stderr if stdout is empty
+                        log_text = log_result.stdout or log_result.stderr or ''
+                        lines = log_text.strip().split('\n')
                         for line in reversed(lines):
-                            # Look for patterns like "X pages" or page count indicators
-                            match = re.search(r'(\d+)\s+page', line, re.IGNORECASE)
+                            match = re.search(r'"crawled":(\d+)', line)
                             if match:
                                 count = int(match.group(1))
                                 if count > 0:

From 5f5bcedab986b7b93b42d30e325feaca7a5ba214 Mon Sep 17 00:00:00 2001
From: Matt <matt@echo6.co>
Date: Sun, 19 Apr 2026 19:35:42 +0000
Subject: [PATCH 11/11] Fix progress regex and SIGHUP/scan_zims race condition

- Progress regex (Browsertrix "crawled":N JSON format instead of "N pages") landed in the previous patch; no regex change here
- Add 3s delay between SIGHUP to kiwix-serve and scan_zims() call
  so the OPDS catalog is reloaded before we query it for linking

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 lib/scraper_runner.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py
index d6b0299..b83f145 100644
--- a/lib/scraper_runner.py
+++ b/lib/scraper_runner.py
@@ -339,6 +339,9 @@ def _process_job(job, config, stop_event):
     except Exception as e:
         logger.warning(f"Job {job_id}: failed to signal kiwix-serve: {e}")
 
+    # Wait for kiwix-serve to reload its catalog after SIGHUP
+    time.sleep(3)
+
     zim_source_id = None
     try:
         from .zim_monitor import scan_zims