Replace wget/SingleFile/Playwright backends with Zimit

- Zimit Docker container handles all site types (static, SPA, JS redirects)
- Removed: _detect_crawl_mode, _crawl_wget, _crawl_singlefile, preflight logic
- Added: _crawl_zimit() with Docker lifecycle management
- Simplified pipeline: submit → Zimit crawl → kiwix-manage register → done
- No more zimwriterfs step — Zimit produces ZIM directly
- Dashboard UI simplified: removed crawl mode dropdown
- Config simplified: removed reject patterns, preflight, singlefile sections

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-20 06:34:40 +02:00 · 2026-04-19 14:06:23 +00:00 · 2026-04-19 14:06:23 +00:00 · 8945c82e3f
commit 8945c82e3f
parent f0b160ef7c
5 changed files with 212 additions and 606 deletions
--- a/lib/scraper_runner.py
+++ b/lib/scraper_runner.py
@ -1,27 +1,21 @@
 """
 RECON Scraper Runner

-Daemon loop that processes scrape jobs: crawl → zimwriterfs → kiwix-manage.
-Supports two crawl backends:
-  - wget (static sites) — default
-  - SingleFile CLI (JS-rendered sites) — browser mode
-
-Pre-flight detection automatically chooses the right backend unless
-crawl_mode is pre-set on the job.
+Daemon loop that processes scrape jobs: crawl via Zimit → kiwix-manage.
+Zimit (openZIM Docker crawler) handles all site types and produces ZIM
+files directly — no separate zimwriterfs step needed.

 Public entry point: scraper_loop(stop_event, config).

-Config section: scraper (workspace, output_dir, rate_limit_delay, preflight, singlefile)
-DB table: scrape_jobs (status flow: pending → scraping → packaging → complete)
+Config section: scraper (output_dir, docker_image, docker_workers, poll_interval)
+DB table: scrape_jobs (status flow: pending → scraping → registering → complete)
 """
 import glob as _glob
-import json as _json
 import os
 import re
 import shutil
 import signal
 import subprocess
-import tempfile
 import time
 from datetime import datetime, timezone
 from urllib.parse import urlparse
@ -39,6 +33,9 @@ def scraper_loop(stop_event, config):

    logger.info("Scraper runner started")

+    # Clean up any orphan Zimit containers from a previous crash
+    _cleanup_orphan_containers()
+
    while not stop_event.is_set():
        db = StatusDB()
        job = db.get_pending_scrape_job()
@ -97,314 +94,115 @@ def _kill_process(proc, timeout=5):
        proc.wait(timeout=2)


-def _count_html_files(directory):
-    """Count HTML files in a directory tree."""
-    count = 0
-    for root, dirs, files in os.walk(directory):
-        for f in files:
-            if f.lower().endswith(('.html', '.htm')):
-                count += 1
-    return count
-
-
-def _find_welcome_page(content_dir, domain):
-    """Find the welcome page (index.html) in the wget mirror."""
-    domain_dir = None
-    for entry in os.listdir(content_dir):
-        entry_path = os.path.join(content_dir, entry)
-        if os.path.isdir(entry_path):
-            domain_dir = entry_path
-            break
-
-    if not domain_dir:
-        return None, content_dir
-
-    for candidate in ['index.html', 'index.htm']:
-        path = os.path.join(domain_dir, candidate)
-        if os.path.isfile(path):
-            return candidate, domain_dir
-
-    for root, dirs, files in os.walk(domain_dir):
-        for f in sorted(files):
-            if f.lower().endswith(('.html', '.htm')):
-                rel = os.path.relpath(os.path.join(root, f), domain_dir)
-                return rel, domain_dir
-
-    return 'index.html', domain_dir
-
-
-def _create_placeholder_illustration(path):
-    """Create a 48x48 placeholder PNG for zimwriterfs --illustration."""
-    from PIL import Image
-    img = Image.new('RGB', (48, 48), color=(40, 192, 232))
-    img.save(path, 'PNG')
-
-
-# ── Crawl mode detection ──────────────────────────────────────────
-
-
-def _get_chromium_path(config):
-    """Auto-detect Chromium from Playwright's cache, or use config override."""
-    configured = config.get('scraper', {}).get('singlefile', {}).get('chromium_path', '')
-    if configured and os.path.isfile(configured):
-        return configured
-    # Playwright stores Chromium — check both root and user caches
-    search_paths = [
-        os.path.expanduser('~/.cache/ms-playwright/chromium-*/chrome-linux*/chrome'),
-        '/root/.cache/ms-playwright/chromium-*/chrome-linux*/chrome',
-    ]
-    for pattern in search_paths:
-        matches = sorted(_glob.glob(pattern))
-        if matches:
-            return matches[-1]
-    return None
-
-
-def _detect_crawl_mode(url, config):
-    """
-    Pre-flight detection: determine whether a URL needs a browser to crawl.
-
-    Returns (mode, resolved_url) where mode is 'static', 'browser', or 'redirect'.
-    'redirect' means the URL redirected to a different domain (parking page etc.);
-    resolved_url will be the final browser URL in that case.
-    """
-    preflight_cfg = config.get('scraper', {}).get('preflight', {})
-    if not preflight_cfg.get('enabled', True):
-        return 'static', url
-
-    timeout = preflight_cfg.get('timeout', 30)
-    min_static = preflight_cfg.get('min_static_size', 5120)
-    min_browser = preflight_cfg.get('min_browser_size', 20480)
-    spa_markers = preflight_cfg.get('spa_markers', ['div#root', 'div#app', 'div#__next'])
-
-    input_domain = urlparse(url).hostname or ''
-    if input_domain.startswith('www.'):
-        input_domain = input_domain[4:]
-
-    # Step 1: wget single-page fetch
-    wget_html = ''
-    wget_size = 0
+def _cleanup_orphan_containers():
+    """Remove any leftover recon-scraper-* Docker containers from a previous crash."""
    try:
-        with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as tmp:
-            tmp_path = tmp.name
        result = subprocess.run(
-            ['wget', '-q', '-O', tmp_path, '--timeout=30', '--tries=1', url],
-            capture_output=True, text=True, timeout=timeout + 5
+            ['docker', 'ps', '-a', '--filter', 'name=recon-scraper-', '--format', '{{.Names}}'],
+            capture_output=True, text=True, timeout=10
        )
-        if os.path.isfile(tmp_path):
-            wget_size = os.path.getsize(tmp_path)
-            with open(tmp_path, 'r', errors='replace') as f:
-                wget_html = f.read()
-        os.unlink(tmp_path)
+        if result.returncode == 0 and result.stdout.strip():
+            for name in result.stdout.strip().split('\n'):
+                name = name.strip()
+                if name:
+                    subprocess.run(['docker', 'rm', '-f', name], capture_output=True, timeout=10)
+                    logger.info(f"Cleaned up orphan container: {name}")
    except Exception as e:
-        logger.debug(f"Preflight wget failed for {url}: {e}")
-        try:
-            os.unlink(tmp_path)
-        except Exception:
-            pass
-
-    # Step 2: Playwright headless fetch
-    browser_html = ''
-    browser_size = 0
-    browser_url = url
-    try:
-        from playwright.sync_api import sync_playwright
-        with sync_playwright() as p:
-            browser = p.chromium.launch(
-                headless=True,
-                args=['--no-sandbox', '--disable-dev-shm-usage']
-            )
-            page = browser.new_page()
-            page.goto(url, wait_until='networkidle', timeout=timeout * 1000)
-            browser_url = page.url
-            browser_html = page.content()
-            browser_size = len(browser_html.encode('utf-8'))
-            browser.close()
-    except Exception as e:
-        logger.debug(f"Preflight Playwright failed for {url}: {e}")
-        # If Playwright fails entirely, fall back to static
-        return 'static', url
-
-    # Step 3: Decision logic
-    browser_domain = urlparse(browser_url).hostname or ''
-    if browser_domain.startswith('www.'):
-        browser_domain = browser_domain[4:]
-
-    # Check for cross-domain redirect (parking page detection)
-    if browser_domain and input_domain and browser_domain != input_domain:
-        logger.info(f"Preflight: {url} redirected to different domain {browser_domain}, mode=redirect")
-        return 'redirect', browser_url
-
-    # Check size disparity: small wget + large browser = JS-rendered
-    if wget_size < min_static and browser_size > min_browser:
-        logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=browser")
-        return 'browser', url
-
-    # Check for SPA shell markers in wget HTML
-    if wget_html:
-        try:
-            from bs4 import BeautifulSoup
-            soup = BeautifulSoup(wget_html, 'html.parser')
-            for marker in spa_markers:
-                # marker is like 'div#root' — split tag and id
-                parts = marker.split('#', 1)
-                tag = parts[0] if parts[0] else 'div'
-                elem_id = parts[1] if len(parts) > 1 else None
-                elem = soup.find(tag, id=elem_id) if elem_id else soup.find(tag)
-                if elem:
-                    text_content = elem.get_text(strip=True)
-                    if len(text_content) < 100:
-                        logger.info(f"Preflight: {url} has SPA marker {marker} with {len(text_content)} chars text, mode=browser")
-                        return 'browser', url
-        except Exception as e:
-            logger.debug(f"Preflight SPA marker check failed: {e}")
-
-    logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=static")
-    return 'static', url
+        logger.warning(f"Orphan container cleanup failed: {e}")


-# ── Crawl backends ────────────────────────────────────────────────
+# ── Zimit crawl backend ──────────────────────────────────────────


-def _crawl_wget(job, url, site_dir, config, stop_event, db):
+def _crawl_zimit(job, config, stop_event, db):
    """
-    wget mirror crawl backend.
-    Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel.
+    Crawl a URL using Zimit (openZIM Docker crawler).
+
+    Returns (page_count, zim_filename, error_msg).
+    On success: (count, filename, None)
+    On failure: (0, None, error_string)
    """
    job_id = job['id']
+    url = job['url']
+    title = job.get('title') or _sanitize_domain(url)
+    language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng')
+    category = job.get('category') or ''
+
    scraper_cfg = config.get('scraper', {})
-    rate_limit_delay = scraper_cfg.get('rate_limit_delay', 0.5)
-    user_agent = scraper_cfg.get('user_agent', 'Mozilla/5.0 (compatible; RECON/1.0)')
-    keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True)
-    workspace = os.path.dirname(site_dir)
+    output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix')
+    docker_image = scraper_cfg.get('docker_image', 'ghcr.io/openzim/zimit')
+    docker_workers = scraper_cfg.get('docker_workers', 2)

-    # Build reject-regex from config defaults + per-job overrides
-    reject_patterns = []
-    skip_defaults = bool(job.get('skip_default_patterns'))
-    if not skip_defaults:
-        reject_patterns.extend(scraper_cfg.get('default_reject_patterns', []))
-    additional_raw = job.get('additional_reject_patterns')
-    if additional_raw:
-        try:
-            additional = _json.loads(additional_raw) if isinstance(additional_raw, str) else additional_raw
-            if isinstance(additional, list):
-                reject_patterns.extend(additional)
-        except (ValueError, TypeError):
-            pass
-
-    wget_cmd = [
-        'wget', '--mirror', '--convert-links', '--adjust-extension',
-        '--page-requisites', '--no-parent',
-        '--restrict-file-names=windows',
-        f'--wait={rate_limit_delay}', '--random-wait',
-        f'--user-agent={user_agent}',
-        f'--directory-prefix={site_dir}',
-        '--timeout=30', '--tries=3',
-    ]
-    if reject_patterns:
-        combined_regex = '|'.join(f'({p})' for p in reject_patterns)
-        wget_cmd.extend([f'--reject-regex={combined_regex}'])
-        logger.info(f"Job {job_id}: reject-regex has {len(reject_patterns)} patterns")
-    wget_cmd.append(url)
-
-    logger.info(f"Job {job_id}: wget mirror starting")
-    wget_log = os.path.join(workspace, 'wget.log')
-    try:
-        with open(wget_log, 'w') as log_fh:
-            proc = subprocess.Popen(
-                wget_cmd,
-                stdout=log_fh, stderr=subprocess.STDOUT,
-            )
-        db.update_scrape_job(job_id, subprocess_pid=proc.pid)
-
-        while proc.poll() is None:
-            if stop_event.is_set() or _check_cancelled(db, job_id):
-                _kill_process(proc)
-                return 0, 'cancelled'
-            try:
-                proc.wait(timeout=5)
-            except subprocess.TimeoutExpired:
-                pass
-
-        db.update_scrape_job(job_id, subprocess_pid=None)
-
-        if stop_event.is_set() or _check_cancelled(db, job_id):
-            return 0, 'cancelled'
-
-        # wget returns 8 for some server errors but may still have useful content
-        if proc.returncode not in (0, 4, 6, 8):
-            output = ''
-            try:
-                with open(wget_log, 'r') as f:
-                    f.seek(max(0, os.path.getsize(wget_log) - 500))
-                    output = f.read()
-            except Exception:
-                pass
-            return 0, f"wget failed with code {proc.returncode}: {output[-500:]}"
-
-    except Exception as e:
-        return 0, f"wget error: {e}"
-
-    page_count = _count_html_files(site_dir)
-    logger.info(f"Job {job_id}: wget complete, {page_count} HTML pages found")
-
-    if page_count == 0:
-        return 0, 'wget produced no HTML files'
-
-    return page_count, None
-
-
-def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
-    """
-    SingleFile CLI crawl backend for JS-rendered sites.
-    Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel.
-    """
-    job_id = job['id']
-    scraper_cfg = config.get('scraper', {})
-    sf_cfg = scraper_cfg.get('singlefile', {})
-    keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True)
-    workspace = os.path.dirname(site_dir)
-
-    executable = sf_cfg.get('executable', 'single-file')
-    chromium_path = _get_chromium_path(config)
-    crawl_max_depth = sf_cfg.get('crawl_max_depth', 10)
-
-    if not chromium_path:
-        return 0, 'Chromium not found — cannot use browser crawl mode'
-
-    # SingleFile outputs into site_dir/<domain>/ to match wget's structure
    domain = _sanitize_domain(url)
-    output_dir = os.path.join(site_dir, domain)
-    os.makedirs(output_dir, exist_ok=True)
+    date_tag = datetime.now().strftime('%Y-%m')
+    container_name = f'recon-scraper-{job_id}'
+    tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}')

-    sf_cmd = [
-        executable,
-        '--crawl-links=true',
-        '--crawl-inner-links-only=true',
-        '--crawl-no-parent=true',
-        '--crawl-replace-URLs=true',
-        f'--crawl-max-depth={crawl_max_depth}',
-        f'--browser-executable-path={chromium_path}',
-        '--browser-headless=true',
-        '--browser-args=["--no-sandbox","--disable-dev-shm-usage"]',
-        f'--output-directory={output_dir}',
-        url,
+    # Clean up any pre-existing container with same name (retry scenario)
+    subprocess.run(['docker', 'rm', '-f', container_name], capture_output=True, timeout=10)
+
+    os.makedirs(tmp_dir, exist_ok=True)
+
+    description = f"Mirror of {domain}"
+    if category:
+        description = f"{category} — mirror of {domain}"
+
+    docker_cmd = [
+        'docker', 'run', '--rm',
+        '--name', container_name,
+        '-v', f'{tmp_dir}:/output',
+        docker_image,
+        '--url', url,
+        '--name', _sanitize_filename(domain),
+        '--lang', language,
+        '--title', title,
+        '--description', description[:80],
+        '--output', '/output',
+        '--workers', str(docker_workers),
    ]

-    logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth})")
-    sf_log = os.path.join(workspace, 'singlefile.log')
+    logger.info(f"Job {job_id}: Zimit crawl starting — {url}")
    try:
-        with open(sf_log, 'w') as log_fh:
-            proc = subprocess.Popen(
-                sf_cmd,
-                stdout=log_fh, stderr=subprocess.STDOUT,
-            )
+        proc = subprocess.Popen(
+            docker_cmd,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
        db.update_scrape_job(job_id, subprocess_pid=proc.pid)

+        last_progress_check = 0
        while proc.poll() is None:
            if stop_event.is_set() or _check_cancelled(db, job_id):
+                # Stop the Docker container
+                subprocess.run(['docker', 'rm', '-f', container_name],
+                               capture_output=True, timeout=10)
                _kill_process(proc)
-                return 0, 'cancelled'
+                shutil.rmtree(tmp_dir, ignore_errors=True)
+                return 0, None, 'cancelled'
+
+            # Check progress every 30s via docker logs
+            now = time.time()
+            if now - last_progress_check >= 30:
+                last_progress_check = now
+                try:
+                    log_result = subprocess.run(
+                        ['docker', 'logs', '--tail', '20', container_name],
+                        capture_output=True, text=True, timeout=10
+                    )
+                    if log_result.returncode == 0 and log_result.stderr:
+                        # Zimit/Browsertrix logs page counts — look for numbers
+                        lines = log_result.stderr.strip().split('\n')
+                        for line in reversed(lines):
+                            # Look for patterns like "X pages" or page count indicators
+                            match = re.search(r'(\d+)\s+page', line, re.IGNORECASE)
+                            if match:
+                                count = int(match.group(1))
+                                if count > 0:
+                                    db.update_scrape_job(job_id, page_count=count)
+                                break
+                except Exception:
+                    pass
+
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
@ -413,42 +211,59 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
        db.update_scrape_job(job_id, subprocess_pid=None)

        if stop_event.is_set() or _check_cancelled(db, job_id):
-            return 0, 'cancelled'
+            shutil.rmtree(tmp_dir, ignore_errors=True)
+            return 0, None, 'cancelled'

        if proc.returncode != 0:
-            output = ''
+            # Capture last 50 lines of docker logs for error context
+            error_msg = f"Zimit exited with code {proc.returncode}"
            try:
-                with open(sf_log, 'r') as f:
-                    f.seek(max(0, os.path.getsize(sf_log) - 500))
-                    output = f.read()
+                log_result = subprocess.run(
+                    ['docker', 'logs', '--tail', '50', container_name],
+                    capture_output=True, text=True, timeout=10
+                )
+                log_text = (log_result.stderr or log_result.stdout or '').strip()
+                if log_text:
+                    # Take last 500 chars
+                    error_msg += f": {log_text[-500:]}"
            except Exception:
                pass
-            # SingleFile may still produce some files even with non-zero exit
-            page_count = _count_html_files(site_dir)
-            if page_count == 0:
-                return 0, f"SingleFile failed with code {proc.returncode}: {output[-500:]}"
-            logger.warning(f"Job {job_id}: SingleFile exited {proc.returncode} but produced {page_count} pages, continuing")
+            shutil.rmtree(tmp_dir, ignore_errors=True)
+            return 0, None, error_msg

    except Exception as e:
-        return 0, f"SingleFile error: {e}"
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+        return 0, None, f"Zimit error: {e}"

-    # If no index.html exists, rename the first HTML file to index.html
-    index_path = os.path.join(output_dir, 'index.html')
-    if not os.path.isfile(index_path):
-        for f in sorted(os.listdir(output_dir)):
-            if f.lower().endswith(('.html', '.htm')):
-                src = os.path.join(output_dir, f)
-                os.rename(src, index_path)
-                logger.info(f"Job {job_id}: renamed {f} → index.html")
-                break
+    # Find the output ZIM file
+    zim_files = _glob.glob(os.path.join(tmp_dir, '*.zim'))
+    if not zim_files:
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+        return 0, None, 'Zimit produced no ZIM file'

-    page_count = _count_html_files(site_dir)
-    logger.info(f"Job {job_id}: SingleFile complete, {page_count} HTML pages found")
+    src_zim = zim_files[0]  # Should be exactly one

-    if page_count == 0:
-        return 0, 'SingleFile produced no HTML files'
+    # Get page count from file size as rough estimate if we don't have one
+    page_count = 0
+    try:
+        job_state = db.get_scrape_job(job_id)
+        page_count = job_state.get('page_count') or 0
+    except Exception:
+        pass

-    return page_count, None
+    # Rename to final location
+    zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
+    zim_path = os.path.join(output_dir, zim_filename)
+    try:
+        shutil.move(src_zim, zim_path)
+    except Exception as e:
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+        return 0, None, f"Failed to move ZIM to output dir: {e}"
+
+    shutil.rmtree(tmp_dir, ignore_errors=True)
+    logger.info(f"Job {job_id}: Zimit complete — {zim_filename}")
+
+    return page_count, zim_filename, None


 # ── Main job pipeline ─────────────────────────────────────────────
@ -458,183 +273,43 @@ def _process_job(job, config, stop_event):
    """Execute the full scrape pipeline for a single job."""
    db = StatusDB()
    job_id = job['id']
-    url = job['url']
-    title = job.get('title') or _sanitize_domain(url)
-    language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng')
-    category = job.get('category') or ''

-    scraper_cfg = config.get('scraper', {})
-    workspace_root = scraper_cfg.get('workspace', '/opt/recon/data/scraper')
-    output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix')
-    keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True)
+    logger.info(f"Job {job_id}: starting scrape of {job['url']}")

-    workspace = os.path.join(workspace_root, str(job_id))
-    site_dir = os.path.join(workspace, 'site')
-    os.makedirs(site_dir, exist_ok=True)
-
-    domain = _sanitize_domain(url)
-    date_tag = datetime.now().strftime('%Y-%m')
-    zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
-    zim_path = os.path.join(output_dir, zim_filename)
-
-    logger.info(f"Job {job_id}: starting scrape of {url}")
+    # ── Phase 1: Crawl via Zimit ───────────────────────────────────
    db.update_scrape_job(job_id,
                         status='scraping',
-                         workspace_path=workspace,
+                         crawl_mode='zimit',
                         started_at=_now())

-    # ── Phase 0: Pre-flight mode detection ─────────────────────────
    if stop_event.is_set() or _check_cancelled(db, job_id):
-        _handle_cancel(db, job_id, workspace, keep_workspace)
+        _handle_cancel(db, job_id)
        return

-    pre_set = job.get('crawl_mode')
-    if pre_set:
-        crawl_mode, resolved_url = pre_set, url
-        logger.info(f"Job {job_id}: using pre-set crawl_mode={crawl_mode}")
-    else:
-        crawl_mode, resolved_url = _detect_crawl_mode(url, config)
-        logger.info(f"Job {job_id}: detected crawl_mode={crawl_mode}")
-
-    db.update_scrape_job(job_id, crawl_mode=crawl_mode)
-
-    # If redirect detected, update domain/filename to match resolved URL
-    if crawl_mode == 'redirect' and resolved_url != url:
-        logger.info(f"Job {job_id}: URL resolved from {url} → {resolved_url}")
-        domain = _sanitize_domain(resolved_url)
-        zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
-        zim_path = os.path.join(output_dir, zim_filename)
-
-    # ── Phase A: Crawl (dispatch to backend) ────────────────────────
-    if stop_event.is_set() or _check_cancelled(db, job_id):
-        _handle_cancel(db, job_id, workspace, keep_workspace)
-        return
-
-    if crawl_mode == 'browser':
-        page_count, error = _crawl_singlefile(job, resolved_url, site_dir, config, stop_event, db)
-    else:  # 'static' or 'redirect'
-        page_count, error = _crawl_wget(job, resolved_url, site_dir, config, stop_event, db)
+    page_count, zim_filename, error = _crawl_zimit(job, config, stop_event, db)

    if error == 'cancelled':
-        _handle_cancel(db, job_id, workspace, keep_workspace)
+        _handle_cancel(db, job_id)
        return
    elif error:
        db.update_scrape_job(job_id,
                             status='failed',
-                             error_message=error,
+                             error_message=error[:1000],
                             subprocess_pid=None,
                             completed_at=_now())
-        if not keep_workspace:
-            shutil.rmtree(workspace, ignore_errors=True)
        return

    db.update_scrape_job(job_id, page_count=page_count)

-    # ── Phase B: Prepare zimwriterfs inputs ────────────────────────
+    # ── Phase 2: Register with kiwix-serve ─────────────────────────
    if stop_event.is_set() or _check_cancelled(db, job_id):
-        _handle_cancel(db, job_id, workspace, keep_workspace)
+        _handle_cancel(db, job_id)
        return

-    welcome_page, content_dir = _find_welcome_page(site_dir, domain)
-    if welcome_page is None:
-        welcome_page = 'index.html'
-
-    illustration_path = os.path.join(workspace, 'illustration.png')
-    _create_placeholder_illustration(illustration_path)
-    illust_dest = os.path.join(content_dir, 'illustration.png')
-    shutil.copy2(illustration_path, illust_dest)
-
-    description = f"Mirror of {domain}"
-    if category:
-        description = f"{category} — mirror of {domain}"
-
-    logger.info(f"Job {job_id}: packaging ZIM (welcome={welcome_page}, content_dir={content_dir})")
-    db.update_scrape_job(job_id, status='packaging')
-
-    # ── Phase C: zimwriterfs ───────────────────────────────────────
-    if stop_event.is_set() or _check_cancelled(db, job_id):
-        _handle_cancel(db, job_id, workspace, keep_workspace)
-        return
-
-    zim_name = _sanitize_filename(domain)
-    long_description = f"Offline mirror of {resolved_url} created by RECON web scraper"
-
-    zim_cmd = [
-        'zimwriterfs',
-        f'--welcome={welcome_page}',
-        f'--illustration=illustration.png',
-        f'--language={language}',
-        f'--title={title}',
-        f'--description={description[:80]}',
-        f'--longDescription={long_description[:4096]}',
-        f'--name={zim_name}',
-        f'--creator={domain}',
-        '--publisher=RECON',
-        content_dir,
-        zim_path,
-    ]
-
-    zim_log = os.path.join(workspace, 'zimwriterfs.log')
-    try:
-        with open(zim_log, 'w') as log_fh:
-            proc = subprocess.Popen(
-                zim_cmd,
-                stdout=log_fh, stderr=subprocess.STDOUT,
-            )
-        db.update_scrape_job(job_id, subprocess_pid=proc.pid)
-
-        while proc.poll() is None:
-            if stop_event.is_set() or _check_cancelled(db, job_id):
-                _kill_process(proc)
-                _handle_cancel(db, job_id, workspace, keep_workspace)
-                return
-            try:
-                proc.wait(timeout=5)
-            except subprocess.TimeoutExpired:
-                pass
-
-        db.update_scrape_job(job_id, subprocess_pid=None)
-
-        if stop_event.is_set() or _check_cancelled(db, job_id):
-            _handle_cancel(db, job_id, workspace, keep_workspace)
-            return
-
-        if proc.returncode != 0:
-            output = ''
-            try:
-                with open(zim_log, 'r') as f:
-                    f.seek(max(0, os.path.getsize(zim_log) - 500))
-                    output = f.read()
-            except Exception:
-                pass
-            raise RuntimeError(f"zimwriterfs failed with code {proc.returncode}: {output[-500:]}")
-
-    except RuntimeError:
-        raise
-    except Exception as e:
-        db.update_scrape_job(job_id,
-                             status='failed',
-                             error_message=f"zimwriterfs error: {e}",
-                             subprocess_pid=None,
-                             completed_at=_now())
-        if not keep_workspace:
-            shutil.rmtree(workspace, ignore_errors=True)
-        return
-
-    if not os.path.isfile(zim_path):
-        db.update_scrape_job(job_id,
-                             status='failed',
-                             error_message='zimwriterfs produced no output file',
-                             completed_at=_now())
-        return
-
-    logger.info(f"Job {job_id}: ZIM created at {zim_path}")
-
-    # ── Phase D: kiwix-manage + registration ───────────────────────
-    if stop_event.is_set() or _check_cancelled(db, job_id):
-        _handle_cancel(db, job_id, workspace, keep_workspace)
-        return
+    db.update_scrape_job(job_id, status='registering')

+    output_dir = config.get('scraper', {}).get('output_dir', '/mnt/kiwix')
+    zim_path = os.path.join(output_dir, zim_filename)
    kiwix_manage = shutil.which('kiwix-manage') or '/opt/recon/bin/kiwix-manage'
    library_xml = '/mnt/kiwix/library.xml'

@ -670,26 +345,32 @@ def _process_job(job, config, stop_event):
    except Exception as e:
        logger.warning(f"Job {job_id}: scan_zims failed: {e}")

-    try:
-        shutil.rmtree(workspace, ignore_errors=True)
-    except Exception:
-        pass
-
+    # ── Phase 3: Complete ──────────────────────────────────────────
    db.update_scrape_job(job_id,
                         status='complete',
                         zim_filename=zim_filename,
                         zim_source_id=zim_source_id,
                         completed_at=_now())

-    logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages, mode={crawl_mode})")
+    logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages)")


-def _handle_cancel(db, job_id, workspace, keep_workspace):
-    """Handle job cancellation: clean up and update status."""
+def _handle_cancel(db, job_id):
+    """Handle job cancellation: clean up Docker container and update status."""
+    container_name = f'recon-scraper-{job_id}'
+    try:
+        subprocess.run(['docker', 'rm', '-f', container_name],
+                       capture_output=True, timeout=10)
+    except Exception:
+        pass
+
+    # Clean up tmp dir if it exists
+    output_dir = '/mnt/kiwix'
+    tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}')
+    shutil.rmtree(tmp_dir, ignore_errors=True)
+
    logger.info(f"Job {job_id}: cancelled")
    db.update_scrape_job(job_id,
                         status='cancelled',
                         subprocess_pid=None,
                         completed_at=_now())
-    if not keep_workspace:
-        shutil.rmtree(workspace, ignore_errors=True)