From da50e5f0b8f0be3f1e42b0b7f14f729834c66c62 Mon Sep 17 00:00:00 2001 From: Matt Date: Sat, 18 Apr 2026 18:26:43 +0000 Subject: [PATCH 01/11] Add scraper Phase 2: smart crawl mode detection + browser fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Pre-flight detection: wget + Playwright probe to auto-detect if site needs browser rendering (JS apps, parking page redirects) - SingleFile CLI crawl backend for JS-rendered sites - crawl_mode column in scrape_jobs (static/browser/redirect/auto) - API: optional crawl_mode param on submit, cleared on retry - Config: rate_limit_delay 2.0→0.5, /api/ reject pattern, preflight + singlefile config sections - Prerequisites: Node.js 22, single-file-cli, Playwright + Chromium Co-Authored-By: Claude Opus 4.6 --- config.yaml | 78 +++++ lib/api.py | 108 +++++++ lib/scraper_runner.py | 695 ++++++++++++++++++++++++++++++++++++++++++ lib/status.py | 85 ++++++ recon.py | 11 + 5 files changed, 977 insertions(+) create mode 100644 lib/scraper_runner.py diff --git a/config.yaml b/config.yaml index 4b147fd..c98a866 100644 --- a/config.yaml +++ b/config.yaml @@ -413,6 +413,84 @@ peertube: rate_limit_delay: 0.5 # Delay between video ingestions (seconds) poll_interval: 1800 # Seconds between PeerTube acquisition polls (30 min) +scraper: + workspace: /opt/recon/data/scraper # Working directory for wget mirrors + ZIM builds + output_dir: /mnt/kiwix # Finished .zim files land here (kiwix-serve library) + rate_limit_delay: 0.5 # Seconds between wget requests (--wait) + wait_random: 1.0 # Random jitter added to wait (--random-wait range) + default_language: eng # ISO 639-3 language code for ZIM metadata + user_agent: "Mozilla/5.0 (compatible; RECON/1.0; +https://echo6.co)" + poll_interval: 300 # Seconds between checking for pending scrape jobs + keep_workspace_on_failure: true # Retain workspace for debugging when a job fails + + # Default URL patterns rejected by wget --reject-regex. 
+ # Covers common CMS junk across WordPress, Squarespace, Wix, Ghost, Drupal, etc. + # Per-job overrides: additional_reject_patterns (appended) or skip_default_patterns (bypass). + default_reject_patterns: + # WordPress + - '\?share=' + - '\?replytocom=' + - '\?like_comment=' + - '/feed/' + - '/wp-json/' + - '/wp-login' + - '/wp-admin' + - '/wp-cron' + - '\?attachment_id=' + - '/xmlrpc' + - '/trackback' + - '/comment-page-' + - '\?doing_wp_cron' + # Squarespace + - '\?format=json' + - '\?format=rss' + - '/api/' + # Wix + - '/_api/' + - '/_partials/' + # Ghost + - '/ghost/' + - '/p/' + # Drupal + - '\?q=comment' + - '\?q=node' + - '/user/login' + - '/user/register' + # General CMS / site chrome + - '/login' + - '/signup' + - '/register' + - '/cart' + - '/checkout' + - '/search\?' + - '/tag/' + - '/author/' + - '\?print=' + - '\?pdf=' + - '\?format=amp' + - '\?preview=' + - '/rss' + - '/atom' + - '/cdn-cgi/' + + # Pre-flight mode detection + preflight: + enabled: true + timeout: 30 # Seconds for single-page Playwright fetch + min_static_size: 5120 # 5KB - wget HTML below this = suspect JS site + min_browser_size: 20480 # 20KB - browser HTML above this confirms JS + spa_markers: + - 'div#root' + - 'div#app' + - 'div#__next' + + # SingleFile CLI settings (browser crawl mode) + singlefile: + executable: single-file + chromium_path: "" # Auto-detected from Playwright if empty + crawl_max_depth: 10 + crawl_delay: 2 # Seconds between page fetches + # Stream B: New Library Pipeline new_pipeline: # Disabled 2026-04-14 for refactor — see refactored-recon repo for context diff --git a/lib/api.py b/lib/api.py index a739ec0..cbb3377 100644 --- a/lib/api.py +++ b/lib/api.py @@ -2256,6 +2256,114 @@ def _build_kiwix_sources(): } + + +# ── Scraper API ── + +@app.route('/api/scraper/submit', methods=['POST']) +def api_scraper_submit(): + """Submit a new web scrape job.""" + data = request.get_json(silent=True) or {} + url = (data.get('url') or '').strip() + + if not url: + return 
jsonify({'error': 'url is required'}), 400 + if not url.startswith(('http://', 'https://')): + return jsonify({'error': 'URL must start with http:// or https://'}), 400 + + config = get_config() + scraper_cfg = config.get('scraper', {}) + language = data.get('language') or scraper_cfg.get('default_language', 'eng') + title = data.get('title', '').strip() or None + category = data.get('category', '').strip() or None + + # Optional per-job reject pattern overrides + additional_reject_patterns = data.get('additional_reject_patterns') + skip_default_patterns = bool(data.get('skip_default_patterns', False)) + + # Optional crawl mode override (static, browser, redirect, or null for auto-detect) + crawl_mode = data.get('crawl_mode') + if crawl_mode and crawl_mode not in ('static', 'browser', 'redirect'): + return jsonify({'error': "crawl_mode must be 'static', 'browser', 'redirect', or null"}), 400 + + # Serialize additional patterns as JSON if provided + import json as _json + additional_json = _json.dumps(additional_reject_patterns) if additional_reject_patterns else None + + db = StatusDB() + conn = db._get_conn() + conn.execute( + "INSERT INTO scrape_jobs (url, title, language, category, additional_reject_patterns, skip_default_patterns, crawl_mode) VALUES (?, ?, ?, ?, ?, ?, ?)", + (url, title, language, category, additional_json, int(skip_default_patterns), crawl_mode) + ) + conn.commit() + job_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0] + + logger.info(f"Scraper job {job_id} submitted: {url}") + return jsonify({'ok': True, 'job_id': job_id}), 201 + + +@app.route('/api/scraper/jobs') +def api_scraper_jobs(): + """List scrape jobs, optionally filtered by status.""" + status_filter = request.args.get('status') + db = StatusDB() + jobs = db.get_scrape_jobs(status=status_filter) + return jsonify({'jobs': jobs}) + + +@app.route('/api/scraper/cancel/', methods=['POST']) +def api_scraper_cancel(job_id): + """Cancel a scrape job.""" + import os as _os + 
import signal as _signal + + db = StatusDB() + job = db.get_scrape_job(job_id) + if not job: + return jsonify({'error': 'Job not found'}), 404 + + if job['status'] in ('complete', 'cancelled'): + return jsonify({'error': f"Job already {job['status']}"}), 400 + + # Set cancelled in DB — the runner loop checks this between phases + db.update_scrape_job(job_id, status='cancelled') + + # If there's an active subprocess, send SIGTERM + pid = job.get('subprocess_pid') + if pid: + try: + _os.kill(pid, _signal.SIGTERM) + except (ProcessLookupError, PermissionError): + pass # Process already gone + + logger.info(f"Scraper job {job_id} cancelled") + return jsonify({'ok': True}) + + +@app.route('/api/scraper/retry/', methods=['POST']) +def api_scraper_retry(job_id): + """Retry a failed or cancelled scrape job.""" + db = StatusDB() + job = db.get_scrape_job(job_id) + if not job: + return jsonify({'error': 'Job not found'}), 404 + + if job['status'] not in ('failed', 'cancelled'): + return jsonify({'error': f"Job status is '{job['status']}', can only retry failed or cancelled jobs"}), 400 + + db.update_scrape_job(job_id, + status='pending', + error_message=None, + subprocess_pid=None, + crawl_mode=None, + started_at=None, + completed_at=None) + + logger.info(f"Scraper job {job_id} reset to pending for retry") + return jsonify({'ok': True}) + + # ── Metrics API ── @app.route('/api/metrics/history') diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py new file mode 100644 index 0000000..1599f2e --- /dev/null +++ b/lib/scraper_runner.py @@ -0,0 +1,695 @@ +""" +RECON Scraper Runner + +Daemon loop that processes scrape jobs: crawl → zimwriterfs → kiwix-manage. +Supports two crawl backends: + - wget (static sites) — default + - SingleFile CLI (JS-rendered sites) — browser mode + +Pre-flight detection automatically chooses the right backend unless +crawl_mode is pre-set on the job. + +Public entry point: scraper_loop(stop_event, config). 
+ +Config section: scraper (workspace, output_dir, rate_limit_delay, preflight, singlefile) +DB table: scrape_jobs (status flow: pending → scraping → packaging → complete) +""" +import glob as _glob +import json as _json +import os +import re +import shutil +import signal +import subprocess +import tempfile +import time +from datetime import datetime, timezone +from urllib.parse import urlparse + +from .utils import setup_logging +from .status import StatusDB + +logger = setup_logging('recon.scraper_runner') + + +def scraper_loop(stop_event, config): + """Daemon loop: poll for pending scrape jobs, execute pipeline.""" + scraper_cfg = config.get('scraper', {}) + poll_interval = scraper_cfg.get('poll_interval', 300) + + logger.info("Scraper runner started") + + while not stop_event.is_set(): + db = StatusDB() + job = db.get_pending_scrape_job() + if job: + try: + _process_job(job, config, stop_event) + except Exception as e: + logger.error(f"Scraper job {job['id']} unexpected error: {e}", exc_info=True) + try: + db.update_scrape_job(job['id'], + status='failed', + error_message=str(e)[:1000], + subprocess_pid=None, + completed_at=_now()) + except Exception: + pass + else: + stop_event.wait(poll_interval) + + logger.info("Scraper runner stopped") + + +def _now(): + return datetime.now(timezone.utc).isoformat() + + +def _sanitize_domain(url): + """Extract and sanitize domain from URL for use in filenames.""" + parsed = urlparse(url) + domain = parsed.hostname or 'unknown' + if domain.startswith('www.'): + domain = domain[4:] + return domain + + +def _sanitize_filename(s): + """Sanitize a string for safe filename use.""" + return re.sub(r'[^a-zA-Z0-9._-]', '_', s) + + +def _check_cancelled(db, job_id): + """Check if a job has been cancelled in the DB.""" + job = db.get_scrape_job(job_id) + return job and job['status'] == 'cancelled' + + +def _kill_process(proc, timeout=5): + """Gracefully terminate a subprocess, force kill if needed.""" + if proc.poll() is not None: + 
return + try: + proc.terminate() + proc.wait(timeout=timeout) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait(timeout=2) + + +def _count_html_files(directory): + """Count HTML files in a directory tree.""" + count = 0 + for root, dirs, files in os.walk(directory): + for f in files: + if f.lower().endswith(('.html', '.htm')): + count += 1 + return count + + +def _find_welcome_page(content_dir, domain): + """Find the welcome page (index.html) in the wget mirror.""" + domain_dir = None + for entry in os.listdir(content_dir): + entry_path = os.path.join(content_dir, entry) + if os.path.isdir(entry_path): + domain_dir = entry_path + break + + if not domain_dir: + return None, content_dir + + for candidate in ['index.html', 'index.htm']: + path = os.path.join(domain_dir, candidate) + if os.path.isfile(path): + return candidate, domain_dir + + for root, dirs, files in os.walk(domain_dir): + for f in sorted(files): + if f.lower().endswith(('.html', '.htm')): + rel = os.path.relpath(os.path.join(root, f), domain_dir) + return rel, domain_dir + + return 'index.html', domain_dir + + +def _create_placeholder_illustration(path): + """Create a 48x48 placeholder PNG for zimwriterfs --illustration.""" + from PIL import Image + img = Image.new('RGB', (48, 48), color=(40, 192, 232)) + img.save(path, 'PNG') + + +# ── Crawl mode detection ────────────────────────────────────────── + + +def _get_chromium_path(config): + """Auto-detect Chromium from Playwright's cache, or use config override.""" + configured = config.get('scraper', {}).get('singlefile', {}).get('chromium_path', '') + if configured and os.path.isfile(configured): + return configured + # Playwright stores Chromium — check both root and user caches + search_paths = [ + os.path.expanduser('~/.cache/ms-playwright/chromium-*/chrome-linux*/chrome'), + '/root/.cache/ms-playwright/chromium-*/chrome-linux*/chrome', + ] + for pattern in search_paths: + matches = sorted(_glob.glob(pattern)) + if matches: + return 
matches[-1] + return None + + +def _detect_crawl_mode(url, config): + """ + Pre-flight detection: determine whether a URL needs a browser to crawl. + + Returns (mode, resolved_url) where mode is 'static', 'browser', or 'redirect'. + 'redirect' means the URL redirected to a different domain (parking page etc.); + resolved_url will be the final browser URL in that case. + """ + preflight_cfg = config.get('scraper', {}).get('preflight', {}) + if not preflight_cfg.get('enabled', True): + return 'static', url + + timeout = preflight_cfg.get('timeout', 30) + min_static = preflight_cfg.get('min_static_size', 5120) + min_browser = preflight_cfg.get('min_browser_size', 20480) + spa_markers = preflight_cfg.get('spa_markers', ['div#root', 'div#app', 'div#__next']) + + input_domain = urlparse(url).hostname or '' + if input_domain.startswith('www.'): + input_domain = input_domain[4:] + + # Step 1: wget single-page fetch + wget_html = '' + wget_size = 0 + try: + with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as tmp: + tmp_path = tmp.name + result = subprocess.run( + ['wget', '-q', '-O', tmp_path, '--timeout=30', '--tries=1', url], + capture_output=True, text=True, timeout=timeout + 5 + ) + if os.path.isfile(tmp_path): + wget_size = os.path.getsize(tmp_path) + with open(tmp_path, 'r', errors='replace') as f: + wget_html = f.read() + os.unlink(tmp_path) + except Exception as e: + logger.debug(f"Preflight wget failed for {url}: {e}") + try: + os.unlink(tmp_path) + except Exception: + pass + + # Step 2: Playwright headless fetch + browser_html = '' + browser_size = 0 + browser_url = url + try: + from playwright.sync_api import sync_playwright + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=['--no-sandbox', '--disable-dev-shm-usage'] + ) + page = browser.new_page() + page.goto(url, wait_until='networkidle', timeout=timeout * 1000) + browser_url = page.url + browser_html = page.content() + browser_size = 
len(browser_html.encode('utf-8')) + browser.close() + except Exception as e: + logger.debug(f"Preflight Playwright failed for {url}: {e}") + # If Playwright fails entirely, fall back to static + return 'static', url + + # Step 3: Decision logic + browser_domain = urlparse(browser_url).hostname or '' + if browser_domain.startswith('www.'): + browser_domain = browser_domain[4:] + + # Check for cross-domain redirect (parking page detection) + if browser_domain and input_domain and browser_domain != input_domain: + logger.info(f"Preflight: {url} redirected to different domain {browser_domain}, mode=redirect") + return 'redirect', browser_url + + # Check size disparity: small wget + large browser = JS-rendered + if wget_size < min_static and browser_size > min_browser: + logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=browser") + return 'browser', url + + # Check for SPA shell markers in wget HTML + if wget_html: + try: + from bs4 import BeautifulSoup + soup = BeautifulSoup(wget_html, 'html.parser') + for marker in spa_markers: + # marker is like 'div#root' — split tag and id + parts = marker.split('#', 1) + tag = parts[0] if parts[0] else 'div' + elem_id = parts[1] if len(parts) > 1 else None + elem = soup.find(tag, id=elem_id) if elem_id else soup.find(tag) + if elem: + text_content = elem.get_text(strip=True) + if len(text_content) < 100: + logger.info(f"Preflight: {url} has SPA marker {marker} with {len(text_content)} chars text, mode=browser") + return 'browser', url + except Exception as e: + logger.debug(f"Preflight SPA marker check failed: {e}") + + logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=static") + return 'static', url + + +# ── Crawl backends ──────────────────────────────────────────────── + + +def _crawl_wget(job, url, site_dir, config, stop_event, db): + """ + wget mirror crawl backend. + Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel. 
+ """ + job_id = job['id'] + scraper_cfg = config.get('scraper', {}) + rate_limit_delay = scraper_cfg.get('rate_limit_delay', 0.5) + user_agent = scraper_cfg.get('user_agent', 'Mozilla/5.0 (compatible; RECON/1.0)') + keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True) + workspace = os.path.dirname(site_dir) + + # Build reject-regex from config defaults + per-job overrides + reject_patterns = [] + skip_defaults = bool(job.get('skip_default_patterns')) + if not skip_defaults: + reject_patterns.extend(scraper_cfg.get('default_reject_patterns', [])) + additional_raw = job.get('additional_reject_patterns') + if additional_raw: + try: + additional = _json.loads(additional_raw) if isinstance(additional_raw, str) else additional_raw + if isinstance(additional, list): + reject_patterns.extend(additional) + except (ValueError, TypeError): + pass + + wget_cmd = [ + 'wget', '--mirror', '--convert-links', '--adjust-extension', + '--page-requisites', '--no-parent', + '--restrict-file-names=windows', + f'--wait={rate_limit_delay}', '--random-wait', + f'--user-agent={user_agent}', + f'--directory-prefix={site_dir}', + '--timeout=30', '--tries=3', + ] + if reject_patterns: + combined_regex = '|'.join(f'({p})' for p in reject_patterns) + wget_cmd.extend([f'--reject-regex={combined_regex}']) + logger.info(f"Job {job_id}: reject-regex has {len(reject_patterns)} patterns") + wget_cmd.append(url) + + logger.info(f"Job {job_id}: wget mirror starting") + wget_log = os.path.join(workspace, 'wget.log') + try: + with open(wget_log, 'w') as log_fh: + proc = subprocess.Popen( + wget_cmd, + stdout=log_fh, stderr=subprocess.STDOUT, + ) + db.update_scrape_job(job_id, subprocess_pid=proc.pid) + + while proc.poll() is None: + if stop_event.is_set() or _check_cancelled(db, job_id): + _kill_process(proc) + return 0, 'cancelled' + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + pass + + db.update_scrape_job(job_id, subprocess_pid=None) + + if stop_event.is_set() or 
_check_cancelled(db, job_id): + return 0, 'cancelled' + + # wget returns 8 for some server errors but may still have useful content + if proc.returncode not in (0, 4, 6, 8): + output = '' + try: + with open(wget_log, 'r') as f: + f.seek(max(0, os.path.getsize(wget_log) - 500)) + output = f.read() + except Exception: + pass + return 0, f"wget failed with code {proc.returncode}: {output[-500:]}" + + except Exception as e: + return 0, f"wget error: {e}" + + page_count = _count_html_files(site_dir) + logger.info(f"Job {job_id}: wget complete, {page_count} HTML pages found") + + if page_count == 0: + return 0, 'wget produced no HTML files' + + return page_count, None + + +def _crawl_singlefile(job, url, site_dir, config, stop_event, db): + """ + SingleFile CLI crawl backend for JS-rendered sites. + Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel. + """ + job_id = job['id'] + scraper_cfg = config.get('scraper', {}) + sf_cfg = scraper_cfg.get('singlefile', {}) + keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True) + workspace = os.path.dirname(site_dir) + + executable = sf_cfg.get('executable', 'single-file') + chromium_path = _get_chromium_path(config) + crawl_max_depth = sf_cfg.get('crawl_max_depth', 10) + crawl_delay = sf_cfg.get('crawl_delay', 2) + + if not chromium_path: + return 0, 'Chromium not found — cannot use browser crawl mode' + + # SingleFile outputs into site_dir// to match wget's structure + domain = _sanitize_domain(url) + output_dir = os.path.join(site_dir, domain) + os.makedirs(output_dir, exist_ok=True) + + sf_cmd = [ + executable, + '--crawl-links=true', + '--crawl-inner-links-only=true', + f'--crawl-max-depth={crawl_max_depth}', + f'--crawl-delay={crawl_delay * 1000}', # milliseconds + f'--browser-executable-path={chromium_path}', + '--browser-headless=true', + '--browser-args=["--no-sandbox","--disable-dev-shm-usage"]', + f'--output-directory={output_dir}', + url, + ] + + logger.info(f"Job 
{job_id}: SingleFile crawl starting (depth={crawl_max_depth}, delay={crawl_delay}s)") + sf_log = os.path.join(workspace, 'singlefile.log') + try: + with open(sf_log, 'w') as log_fh: + proc = subprocess.Popen( + sf_cmd, + stdout=log_fh, stderr=subprocess.STDOUT, + ) + db.update_scrape_job(job_id, subprocess_pid=proc.pid) + + while proc.poll() is None: + if stop_event.is_set() or _check_cancelled(db, job_id): + _kill_process(proc) + return 0, 'cancelled' + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + pass + + db.update_scrape_job(job_id, subprocess_pid=None) + + if stop_event.is_set() or _check_cancelled(db, job_id): + return 0, 'cancelled' + + if proc.returncode != 0: + output = '' + try: + with open(sf_log, 'r') as f: + f.seek(max(0, os.path.getsize(sf_log) - 500)) + output = f.read() + except Exception: + pass + # SingleFile may still produce some files even with non-zero exit + page_count = _count_html_files(site_dir) + if page_count == 0: + return 0, f"SingleFile failed with code {proc.returncode}: {output[-500:]}" + logger.warning(f"Job {job_id}: SingleFile exited {proc.returncode} but produced {page_count} pages, continuing") + + except Exception as e: + return 0, f"SingleFile error: {e}" + + # If no index.html exists, rename the first HTML file to index.html + index_path = os.path.join(output_dir, 'index.html') + if not os.path.isfile(index_path): + for f in sorted(os.listdir(output_dir)): + if f.lower().endswith(('.html', '.htm')): + src = os.path.join(output_dir, f) + os.rename(src, index_path) + logger.info(f"Job {job_id}: renamed {f} → index.html") + break + + page_count = _count_html_files(site_dir) + logger.info(f"Job {job_id}: SingleFile complete, {page_count} HTML pages found") + + if page_count == 0: + return 0, 'SingleFile produced no HTML files' + + return page_count, None + + +# ── Main job pipeline ───────────────────────────────────────────── + + +def _process_job(job, config, stop_event): + """Execute the full scrape 
pipeline for a single job.""" + db = StatusDB() + job_id = job['id'] + url = job['url'] + title = job.get('title') or _sanitize_domain(url) + language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng') + category = job.get('category') or '' + + scraper_cfg = config.get('scraper', {}) + workspace_root = scraper_cfg.get('workspace', '/opt/recon/data/scraper') + output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix') + keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True) + + workspace = os.path.join(workspace_root, str(job_id)) + site_dir = os.path.join(workspace, 'site') + os.makedirs(site_dir, exist_ok=True) + + domain = _sanitize_domain(url) + date_tag = datetime.now().strftime('%Y-%m') + zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}.zim" + zim_path = os.path.join(output_dir, zim_filename) + + logger.info(f"Job {job_id}: starting scrape of {url}") + db.update_scrape_job(job_id, + status='scraping', + workspace_path=workspace, + started_at=_now()) + + # ── Phase 0: Pre-flight mode detection ───────────────────────── + if stop_event.is_set() or _check_cancelled(db, job_id): + _handle_cancel(db, job_id, workspace, keep_workspace) + return + + pre_set = job.get('crawl_mode') + if pre_set: + crawl_mode, resolved_url = pre_set, url + logger.info(f"Job {job_id}: using pre-set crawl_mode={crawl_mode}") + else: + crawl_mode, resolved_url = _detect_crawl_mode(url, config) + logger.info(f"Job {job_id}: detected crawl_mode={crawl_mode}") + + db.update_scrape_job(job_id, crawl_mode=crawl_mode) + + # If redirect detected, update domain/filename to match resolved URL + if crawl_mode == 'redirect' and resolved_url != url: + logger.info(f"Job {job_id}: URL resolved from {url} → {resolved_url}") + domain = _sanitize_domain(resolved_url) + zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}.zim" + zim_path = os.path.join(output_dir, zim_filename) + + # ── Phase A: Crawl (dispatch to backend) 
──────────────────────── + if stop_event.is_set() or _check_cancelled(db, job_id): + _handle_cancel(db, job_id, workspace, keep_workspace) + return + + if crawl_mode == 'browser': + page_count, error = _crawl_singlefile(job, resolved_url, site_dir, config, stop_event, db) + else: # 'static' or 'redirect' + page_count, error = _crawl_wget(job, resolved_url, site_dir, config, stop_event, db) + + if error == 'cancelled': + _handle_cancel(db, job_id, workspace, keep_workspace) + return + elif error: + db.update_scrape_job(job_id, + status='failed', + error_message=error, + subprocess_pid=None, + completed_at=_now()) + if not keep_workspace: + shutil.rmtree(workspace, ignore_errors=True) + return + + db.update_scrape_job(job_id, page_count=page_count) + + # ── Phase B: Prepare zimwriterfs inputs ──────────────────────── + if stop_event.is_set() or _check_cancelled(db, job_id): + _handle_cancel(db, job_id, workspace, keep_workspace) + return + + welcome_page, content_dir = _find_welcome_page(site_dir, domain) + if welcome_page is None: + welcome_page = 'index.html' + + illustration_path = os.path.join(workspace, 'illustration.png') + _create_placeholder_illustration(illustration_path) + illust_dest = os.path.join(content_dir, 'illustration.png') + shutil.copy2(illustration_path, illust_dest) + + description = f"Mirror of {domain}" + if category: + description = f"{category} — mirror of {domain}" + + logger.info(f"Job {job_id}: packaging ZIM (welcome={welcome_page}, content_dir={content_dir})") + db.update_scrape_job(job_id, status='packaging') + + # ── Phase C: zimwriterfs ─────────────────────────────────────── + if stop_event.is_set() or _check_cancelled(db, job_id): + _handle_cancel(db, job_id, workspace, keep_workspace) + return + + zim_name = _sanitize_filename(domain) + long_description = f"Offline mirror of {resolved_url} created by RECON web scraper" + + zim_cmd = [ + 'zimwriterfs', + f'--welcome={welcome_page}', + f'--illustration=illustration.png', + 
f'--language={language}', + f'--title={title}', + f'--description={description[:80]}', + f'--longDescription={long_description[:4096]}', + f'--name={zim_name}', + f'--creator={domain}', + '--publisher=RECON', + content_dir, + zim_path, + ] + + zim_log = os.path.join(workspace, 'zimwriterfs.log') + try: + with open(zim_log, 'w') as log_fh: + proc = subprocess.Popen( + zim_cmd, + stdout=log_fh, stderr=subprocess.STDOUT, + ) + db.update_scrape_job(job_id, subprocess_pid=proc.pid) + + while proc.poll() is None: + if stop_event.is_set() or _check_cancelled(db, job_id): + _kill_process(proc) + _handle_cancel(db, job_id, workspace, keep_workspace) + return + try: + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + pass + + db.update_scrape_job(job_id, subprocess_pid=None) + + if stop_event.is_set() or _check_cancelled(db, job_id): + _handle_cancel(db, job_id, workspace, keep_workspace) + return + + if proc.returncode != 0: + output = '' + try: + with open(zim_log, 'r') as f: + f.seek(max(0, os.path.getsize(zim_log) - 500)) + output = f.read() + except Exception: + pass + raise RuntimeError(f"zimwriterfs failed with code {proc.returncode}: {output[-500:]}") + + except RuntimeError: + raise + except Exception as e: + db.update_scrape_job(job_id, + status='failed', + error_message=f"zimwriterfs error: {e}", + subprocess_pid=None, + completed_at=_now()) + if not keep_workspace: + shutil.rmtree(workspace, ignore_errors=True) + return + + if not os.path.isfile(zim_path): + db.update_scrape_job(job_id, + status='failed', + error_message='zimwriterfs produced no output file', + completed_at=_now()) + return + + logger.info(f"Job {job_id}: ZIM created at {zim_path}") + + # ── Phase D: kiwix-manage + registration ─────────────────────── + if stop_event.is_set() or _check_cancelled(db, job_id): + _handle_cancel(db, job_id, workspace, keep_workspace) + return + + kiwix_manage = shutil.which('kiwix-manage') or '/opt/recon/bin/kiwix-manage' + library_xml = 
'/mnt/kiwix/library.xml' + + try: + subprocess.run( + [kiwix_manage, library_xml, 'add', zim_path], + capture_output=True, text=True, timeout=30 + ) + logger.info(f"Job {job_id}: registered with kiwix-serve library") + except Exception as e: + logger.warning(f"Job {job_id}: kiwix-manage add failed: {e}") + + try: + result = subprocess.run(['pidof', 'kiwix-serve'], capture_output=True, text=True, timeout=5) + if result.returncode == 0 and result.stdout.strip(): + pid = int(result.stdout.strip().split()[0]) + os.kill(pid, signal.SIGHUP) + logger.info(f"Job {job_id}: sent SIGHUP to kiwix-serve (pid {pid})") + except Exception as e: + logger.warning(f"Job {job_id}: failed to signal kiwix-serve: {e}") + + zim_source_id = None + try: + from .zim_monitor import scan_zims + scan_zims() + conn = db._get_conn() + row = conn.execute( + "SELECT id FROM zim_sources WHERE zim_filename = ?", (zim_filename,) + ).fetchone() + if row: + zim_source_id = row['id'] + logger.info(f"Job {job_id}: linked to zim_source_id={zim_source_id}") + except Exception as e: + logger.warning(f"Job {job_id}: scan_zims failed: {e}") + + try: + shutil.rmtree(workspace, ignore_errors=True) + except Exception: + pass + + db.update_scrape_job(job_id, + status='complete', + zim_filename=zim_filename, + zim_source_id=zim_source_id, + completed_at=_now()) + + logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages, mode={crawl_mode})") + + +def _handle_cancel(db, job_id, workspace, keep_workspace): + """Handle job cancellation: clean up and update status.""" + logger.info(f"Job {job_id}: cancelled") + db.update_scrape_job(job_id, + status='cancelled', + subprocess_pid=None, + completed_at=_now()) + if not keep_workspace: + shutil.rmtree(workspace, ignore_errors=True) diff --git a/lib/status.py b/lib/status.py index 20cc77b..974cabd 100644 --- a/lib/status.py +++ b/lib/status.py @@ -105,6 +105,25 @@ class StatusDB: except Exception: pass # column already exists + # Migration: add 
subprocess_pid column to scrape_jobs if missing + try: + conn.execute("ALTER TABLE scrape_jobs ADD COLUMN subprocess_pid INTEGER") + except Exception: + pass # column already exists + + # Migration: add reject pattern columns to scrape_jobs if missing + for col, coltype in [('additional_reject_patterns', 'TEXT'), ('skip_default_patterns', 'INTEGER DEFAULT 0')]: + try: + conn.execute(f"ALTER TABLE scrape_jobs ADD COLUMN {col} {coltype}") + except Exception: + pass # column already exists + + # Migration: add crawl_mode column to scrape_jobs if missing + try: + conn.execute("ALTER TABLE scrape_jobs ADD COLUMN crawl_mode TEXT") + except Exception: + pass # column already exists + # Stream B: file_operations + duplicate_review tables conn.executescript(""" CREATE TABLE IF NOT EXISTS file_operations ( @@ -142,6 +161,28 @@ class StatusDB: resolved_at TEXT ); CREATE INDEX IF NOT EXISTS idx_dupreview_status ON duplicate_review(status); + + CREATE TABLE IF NOT EXISTS scrape_jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + url TEXT NOT NULL, + title TEXT, + language TEXT DEFAULT 'eng', + category TEXT, + status TEXT DEFAULT 'pending', + page_count INTEGER DEFAULT 0, + error_message TEXT, + zim_filename TEXT, + zim_source_id INTEGER, + workspace_path TEXT, + subprocess_pid INTEGER, + additional_reject_patterns TEXT, + skip_default_patterns INTEGER DEFAULT 0, + crawl_mode TEXT, + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + started_at TEXT, + completed_at TEXT + ); + CREATE INDEX IF NOT EXISTS idx_scrape_status ON scrape_jobs(status); """) conn.commit() @@ -406,6 +447,50 @@ class StatusDB: ) conn.commit() + + # ── Scraper Job Helpers ───────────────────────────────────── + + def get_pending_scrape_job(self): + """Fetch the oldest pending scrape job.""" + conn = self._get_conn() + row = conn.execute( + "SELECT * FROM scrape_jobs WHERE status = 'pending' ORDER BY id ASC LIMIT 1" + ).fetchone() + return dict(row) if row else None + + def update_scrape_job(self, job_id, **kwargs): 
+ """Update arbitrary columns on a scrape job.""" + if not kwargs: + return + conn = self._get_conn() + sets = [] + vals = [] + for k, v in kwargs.items(): + sets.append(f"{k} = ?") + vals.append(v) + vals.append(job_id) + conn.execute(f"UPDATE scrape_jobs SET {', '.join(sets)} WHERE id = ?", vals) + conn.commit() + + def get_scrape_jobs(self, status=None): + """List scrape jobs, optionally filtered by status.""" + conn = self._get_conn() + if status: + rows = conn.execute( + "SELECT * FROM scrape_jobs WHERE status = ? ORDER BY id DESC", (status,) + ).fetchall() + else: + rows = conn.execute( + "SELECT * FROM scrape_jobs ORDER BY id DESC" + ).fetchall() + return [dict(r) for r in rows] + + def get_scrape_job(self, job_id): + """Get a single scrape job by ID.""" + conn = self._get_conn() + row = conn.execute("SELECT * FROM scrape_jobs WHERE id = ?", (job_id,)).fetchone() + return dict(row) if row else None + # ── Stream B: File Operations ─────────────────────────────────── def log_file_operation(self, doc_hash, operation, source_path, target_path, diff --git a/recon.py b/recon.py index 47dda7d..9635a59 100755 --- a/recon.py +++ b/recon.py @@ -692,12 +692,23 @@ def cmd_service(args): daemon=True, name='dashboard'), ] + # Scraper daemon: polls for pending scrape jobs, runs wget+zimwriterfs pipeline + scraper_cfg = config.get('scraper', {}) + if scraper_cfg.get('workspace'): + from lib.scraper_runner import scraper_loop + threads.append( + threading.Thread(target=lambda: scraper_loop(stop_event, config), + daemon=True, name='scraper') + ) + logger.info("=== RECON Service Starting ===") logger.info(f" Dashboard: {web_host}:{web_port}") logger.info(f" Workers: enrich={enrich_workers}, embed={embed_workers}") logger.info(f" Dispatcher: every {dispatch_interval}s | Filing: every {filing_interval}s") pt_interval = config.get("peertube", {}).get("poll_interval", 1800) logger.info(f" PeerTube acquisition: every {pt_interval}s") + if scraper_cfg.get('workspace'): + 
logger.info(f" Scraper: every {scraper_cfg.get('poll_interval', 300)}s") logger.info(f" Progress: every {progress_interval}s") for t in threads: From 125602fa1369ea0b1ec7a98406e2321473e428d1 Mon Sep 17 00:00:00 2001 From: Matt Date: Sat, 18 Apr 2026 19:28:03 +0000 Subject: [PATCH 02/11] Fix SingleFile CLI: remove invalid --crawl-delay flag SingleFile CLI has no --crawl-delay option. The invalid flag caused the process to print help and exit with no output. Added --crawl-no-parent and --crawl-replace-URLs instead. Removed unused crawl_delay config key. Co-Authored-By: Claude Opus 4.6 --- config.yaml | 1 - lib/scraper_runner.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/config.yaml b/config.yaml index c98a866..bdabf69 100644 --- a/config.yaml +++ b/config.yaml @@ -489,7 +489,6 @@ scraper: executable: single-file chromium_path: "" # Auto-detected from Playwright if empty crawl_max_depth: 10 - crawl_delay: 2 # Seconds between page fetches # Stream B: New Library Pipeline new_pipeline: diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py index 1599f2e..a3ff820 100644 --- a/lib/scraper_runner.py +++ b/lib/scraper_runner.py @@ -368,7 +368,6 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db): executable = sf_cfg.get('executable', 'single-file') chromium_path = _get_chromium_path(config) crawl_max_depth = sf_cfg.get('crawl_max_depth', 10) - crawl_delay = sf_cfg.get('crawl_delay', 2) if not chromium_path: return 0, 'Chromium not found — cannot use browser crawl mode' @@ -382,8 +381,9 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db): executable, '--crawl-links=true', '--crawl-inner-links-only=true', + '--crawl-no-parent=true', + '--crawl-replace-URLs=true', f'--crawl-max-depth={crawl_max_depth}', - f'--crawl-delay={crawl_delay * 1000}', # milliseconds f'--browser-executable-path={chromium_path}', '--browser-headless=true', '--browser-args=["--no-sandbox","--disable-dev-shm-usage"]', @@ -391,7 +391,7 @@ 
def _crawl_singlefile(job, url, site_dir, config, stop_event, db): url, ] - logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth}, delay={crawl_delay}s)") + logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth})") sf_log = os.path.join(workspace, 'singlefile.log') try: with open(sf_log, 'w') as log_fh: From 45b954fccc3b60ecf182e6ea55f12692916b894d Mon Sep 17 00:00:00 2001 From: Matt Date: Sat, 18 Apr 2026 20:17:53 +0000 Subject: [PATCH 03/11] Fix ZIM filename collisions by appending job ID Format: {domain}_{lang}_{YYYY-MM}_{job_id}.zim Prevents zimwriterfs failures when the same domain is scraped multiple times in the same month. Co-Authored-By: Claude Opus 4.6 --- lib/scraper_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py index a3ff820..280b874 100644 --- a/lib/scraper_runner.py +++ b/lib/scraper_runner.py @@ -474,7 +474,7 @@ def _process_job(job, config, stop_event): domain = _sanitize_domain(url) date_tag = datetime.now().strftime('%Y-%m') - zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}.zim" + zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim" zim_path = os.path.join(output_dir, zim_filename) logger.info(f"Job {job_id}: starting scrape of {url}") @@ -502,7 +502,7 @@ def _process_job(job, config, stop_event): if crawl_mode == 'redirect' and resolved_url != url: logger.info(f"Job {job_id}: URL resolved from {url} → {resolved_url}") domain = _sanitize_domain(resolved_url) - zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}.zim" + zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim" zim_path = os.path.join(output_dir, zim_filename) # ── Phase A: Crawl (dispatch to backend) ──────────────────────── From 1ce9a3731f566aaba230b91905477dfb37a2b636 Mon Sep 17 00:00:00 2001 From: Matt Date: Sat, 18 Apr 2026 20:47:17 +0000 Subject: [PATCH 04/11] Add 
scraper dashboard UI under Kiwix tab New /kiwix/scraper page with submit form (URL, title, language, crawl mode), stats cards, and auto-refreshing jobs table with cancel/retry actions. Kiwix section now has Library/Scraper subnav. Co-Authored-By: Claude Opus 4.6 --- lib/api.py | 11 ++- static/css/recon.css | 1 + static/js/scraper.js | 155 +++++++++++++++++++++++++++++++++++ templates/kiwix/scraper.html | 91 ++++++++++++++++++++ 4 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 static/js/scraper.js create mode 100644 templates/kiwix/scraper.html diff --git a/lib/api.py b/lib/api.py index cbb3377..aa13a39 100644 --- a/lib/api.py +++ b/lib/api.py @@ -60,7 +60,10 @@ PEERTUBE_SUBNAV = [ ] -KIWIX_SUBNAV = [] # Single-page, no subnav needed +KIWIX_SUBNAV = [ + {'href': '/kiwix', 'label': 'Library'}, + {'href': '/kiwix/scraper', 'label': 'Scraper'}, +] SETTINGS_SUBNAV = [ {'href': '/settings/keys', 'label': 'API Keys'}, {'href': '/settings/cookies', 'label': 'YouTube Cookies'}, @@ -1956,6 +1959,12 @@ def kiwix_dashboard(): domain='kiwix', subnav=KIWIX_SUBNAV, active_page='/kiwix') +@app.route('/kiwix/scraper') +def kiwix_scraper(): + return render_template('kiwix/scraper.html', + domain='kiwix', subnav=KIWIX_SUBNAV, active_page='/kiwix/scraper') + + @app.route('/api/kiwix/sources') def api_kiwix_sources(): """Serve pre-cached Kiwix sources data (never blocks).""" diff --git a/static/css/recon.css b/static/css/recon.css index 31d6306..a272876 100644 --- a/static/css/recon.css +++ b/static/css/recon.css @@ -331,3 +331,4 @@ tr:hover { background: var(--bg-secondary); } .badge-detected { background: #333; color: #888; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; } .badge-processing { background: #4a3a1a; color: #f59e0b; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; } .badge-extracting { background: #1a3a5a; color: #0ea5e9; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; } +.badge-failed { background: 
#4a1a1a; color: #ff4444; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; } diff --git a/static/js/scraper.js b/static/js/scraper.js new file mode 100644 index 0000000..6aa23d7 --- /dev/null +++ b/static/js/scraper.js @@ -0,0 +1,155 @@ +/* RECON Scraper Dashboard JS */ +(function() { + 'use strict'; + + function loadJobs() { + return RECON.fetchJSON('/api/scraper/jobs').then(function(data) { + var jobs = data.jobs || []; + + // Stats + var total = jobs.length; + var active = 0, complete = 0, failed = 0; + jobs.forEach(function(j) { + if (j.status === 'complete') complete++; + else if (j.status === 'failed') failed++; + else if (j.status === 'running' || j.status === 'pending') active++; + }); + RECON.set('sc-total', RECON.fmt(total)); + RECON.set('sc-active', RECON.fmt(active)); + RECON.set('sc-complete', RECON.fmt(complete)); + RECON.set('sc-failed', RECON.fmt(failed)); + + // Table + var html = ''; + jobs.forEach(function(j) { + var badge = statusBadge(j.status); + var mode = j.crawl_mode ? + '' + j.crawl_mode + '' : '\u2014'; + var pages = j.page_count ? RECON.fmt(j.page_count) : '\u2014'; + var zim = j.zim_filename ? + '' + j.zim_filename + '' : '\u2014'; + var actions = ''; + + if (j.status === 'running' || j.status === 'pending') { + actions = ''; + } else if (j.status === 'failed' || j.status === 'cancelled') { + actions = ''; + } + + // Truncate URL for display + var displayUrl = j.url.length > 40 ? 
j.url.substring(0, 40) + '\u2026' : j.url; + + html += '' + + '' + j.id + '' + + '' + escHtml(displayUrl) + '' + + '' + escHtml(j.title || '\u2014') + '' + + '' + mode + '' + + '' + pages + '' + + '' + badge + errorTooltip(j) + '' + + '' + zim + '' + + '' + actions + '' + + ''; + }); + if (!html) html = 'No scrape jobs'; + RECON.setHTML('sc-table-body', html); + }).catch(function(err) { + console.error('Scraper dashboard error:', err); + }); + } + + function statusBadge(status) { + var map = { + 'pending': 'PENDING', + 'running': 'RUNNING', + 'complete': 'COMPLETE', + 'failed': 'FAILED', + 'cancelled': 'CANCELLED' + }; + return map[status] || '' + (status || 'UNKNOWN').toUpperCase() + ''; + } + + function errorTooltip(job) { + if (!job.error_message) return ''; + var short = job.error_message.length > 80 ? + job.error_message.substring(0, 80) + '\u2026' : job.error_message; + return '
' + escHtml(short) + '
'; + } + + function escHtml(str) { + if (!str) return ''; + return str.replace(/&/g, '&').replace(//g, '>') + .replace(/"/g, '"').replace(/'/g, '''); + } + + function submit(e) { + e.preventDefault(); + var url = document.getElementById('sf-url').value.trim(); + if (!url) return false; + + var body = { url: url }; + var title = document.getElementById('sf-title').value.trim(); + var lang = document.getElementById('sf-lang').value; + var category = document.getElementById('sf-category').value.trim(); + var mode = document.getElementById('sf-mode').value; + + if (title) body.title = title; + if (lang) body.language = lang; + if (category) body.category = category; + if (mode) body.crawl_mode = mode; + + var btn = document.getElementById('sf-submit-btn'); + var feedback = document.getElementById('sf-feedback'); + btn.disabled = true; + btn.textContent = 'Submitting...'; + + RECON.postJSON('/api/scraper/submit', body).then(function(data) { + btn.disabled = false; + btn.textContent = 'Submit'; + if (data.ok) { + feedback.style.display = 'block'; + feedback.style.color = '#00ff41'; + feedback.textContent = 'Job #' + data.job_id + ' submitted successfully'; + document.getElementById('sf-url').value = ''; + document.getElementById('sf-title').value = ''; + document.getElementById('sf-category').value = ''; + setTimeout(function() { feedback.style.display = 'none'; }, 4000); + loadJobs(); + } else { + feedback.style.display = 'block'; + feedback.style.color = '#ff4444'; + feedback.textContent = 'Error: ' + (data.error || 'Unknown error'); + } + }).catch(function(err) { + btn.disabled = false; + btn.textContent = 'Submit'; + feedback.style.display = 'block'; + feedback.style.color = '#ff4444'; + feedback.textContent = 'Network error: ' + err.message; + }); + + return false; + } + + function cancel(jobId) { + if (!confirm('Cancel job #' + jobId + '?')) return; + RECON.postJSON('/api/scraper/cancel/' + jobId).then(function(data) { + if (data.ok) loadJobs(); + else 
alert('Error: ' + (data.error || 'Unknown')); + }); + } + + function retry(jobId) { + RECON.postJSON('/api/scraper/retry/' + jobId).then(function(data) { + if (data.ok) loadJobs(); + else alert('Error: ' + (data.error || 'Unknown')); + }); + } + + // Expose for inline onclick + window.SCRAPER = { submit: submit, cancel: cancel, retry: retry }; + + document.addEventListener('DOMContentLoaded', function() { + RECON.startRefresh(loadJobs, 10000); + }); +})(); diff --git a/templates/kiwix/scraper.html b/templates/kiwix/scraper.html new file mode 100644 index 0000000..53d3e23 --- /dev/null +++ b/templates/kiwix/scraper.html @@ -0,0 +1,91 @@ +{% extends "base.html" %} +{% block content %} +
+ +
+

Submit Scrape Job

+
+
+
+ + +
+
+ + +
+
+
+
+ + +
+
+ + +
+
+ + +
+
+ +
+
+ +
+
+ + +
+
Total Jobs
+
Active
+
Complete
+
Failed
+
+ + +
+

Scrape Jobs

+ + + + + + + + + + + + + + + + +
IDURLTitleModePagesStatusZIM
Loading...
+
+
+{% endblock %} +{% block scripts %} + +{% endblock %} From 45c3bb8d56d431e32fc8ecc5b57aa5cc65c488c2 Mon Sep 17 00:00:00 2001 From: Matt Date: Sat, 18 Apr 2026 21:03:39 +0000 Subject: [PATCH 05/11] Add scraper job queue management (delete, clear failed) New API endpoints: DELETE single job, clear all failed/cancelled. Dashboard now shows Delete buttons on completed/failed jobs, Retry+Delete on failed jobs, and a Clear Failed bulk action. Co-Authored-By: Claude Opus 4.6 --- lib/api.py | 30 ++++++++++++++++++++++++++++++ static/js/scraper.js | 29 ++++++++++++++++++++++++++--- templates/kiwix/scraper.html | 5 ++++- 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/lib/api.py b/lib/api.py index aa13a39..ce0381f 100644 --- a/lib/api.py +++ b/lib/api.py @@ -2373,6 +2373,36 @@ def api_scraper_retry(job_id): return jsonify({'ok': True}) +@app.route('/api/scraper/delete/', methods=['POST']) +def api_scraper_delete(job_id): + """Delete a scrape job (only if not currently running).""" + db = StatusDB() + job = db.get_scrape_job(job_id) + if not job: + return jsonify({'error': 'Job not found'}), 404 + + if job['status'] == 'running': + return jsonify({'error': 'Cannot delete a running job — cancel it first'}), 400 + + conn = db._get_conn() + conn.execute("DELETE FROM scrape_jobs WHERE id = ?", (job_id,)) + conn.commit() + logger.info(f"Scraper job {job_id} deleted") + return jsonify({'ok': True}) + + +@app.route('/api/scraper/clear-failed', methods=['POST']) +def api_scraper_clear_failed(): + """Delete all failed and cancelled scrape jobs.""" + db = StatusDB() + conn = db._get_conn() + result = conn.execute("DELETE FROM scrape_jobs WHERE status IN ('failed', 'cancelled')") + conn.commit() + count = result.rowcount + logger.info(f"Cleared {count} failed/cancelled scraper jobs") + return jsonify({'ok': True, 'deleted': count}) + + # ── Metrics API ── @app.route('/api/metrics/history') diff --git a/static/js/scraper.js b/static/js/scraper.js index 6aa23d7..49ce178 
100644 --- a/static/js/scraper.js +++ b/static/js/scraper.js @@ -11,7 +11,7 @@ var active = 0, complete = 0, failed = 0; jobs.forEach(function(j) { if (j.status === 'complete') complete++; - else if (j.status === 'failed') failed++; + else if (j.status === 'failed' || j.status === 'cancelled') failed++; else if (j.status === 'running' || j.status === 'pending') active++; }); RECON.set('sc-total', RECON.fmt(total)); @@ -19,6 +19,10 @@ RECON.set('sc-complete', RECON.fmt(complete)); RECON.set('sc-failed', RECON.fmt(failed)); + // Show/hide Clear Failed button + var clearBtn = document.getElementById('sc-clear-btn'); + if (clearBtn) clearBtn.style.display = failed > 0 ? '' : 'none'; + // Table var html = ''; jobs.forEach(function(j) { @@ -33,7 +37,10 @@ if (j.status === 'running' || j.status === 'pending') { actions = ''; } else if (j.status === 'failed' || j.status === 'cancelled') { - actions = ''; + actions = ' ' + + ''; + } else if (j.status === 'complete') { + actions = ''; } // Truncate URL for display @@ -146,8 +153,24 @@ }); } + function remove(jobId) { + if (!confirm('Delete job #' + jobId + '? 
This cannot be undone.')) return; + RECON.postJSON('/api/scraper/delete/' + jobId).then(function(data) { + if (data.ok) loadJobs(); + else alert('Error: ' + (data.error || 'Unknown')); + }); + } + + function clearFailed() { + if (!confirm('Delete all failed and cancelled jobs?')) return; + RECON.postJSON('/api/scraper/clear-failed').then(function(data) { + if (data.ok) loadJobs(); + else alert('Error: ' + (data.error || 'Unknown')); + }); + } + // Expose for inline onclick - window.SCRAPER = { submit: submit, cancel: cancel, retry: retry }; + window.SCRAPER = { submit: submit, cancel: cancel, retry: retry, remove: remove, clearFailed: clearFailed }; document.addEventListener('DOMContentLoaded', function() { RECON.startRefresh(loadJobs, 10000); diff --git a/templates/kiwix/scraper.html b/templates/kiwix/scraper.html index 53d3e23..3c42f43 100644 --- a/templates/kiwix/scraper.html +++ b/templates/kiwix/scraper.html @@ -65,7 +65,10 @@
-

Scrape Jobs

+
+

Scrape Jobs

+ +
From f0b160ef7ca8fd0097e6cec91df27ad3701b953c Mon Sep 17 00:00:00 2001 From: Matt Date: Sun, 19 Apr 2026 02:28:49 +0000 Subject: [PATCH 06/11] Extract _full_zim_cleanup helper, add SIGHUP + scrape_jobs cleanup - Extract shared _full_zim_cleanup(source_id) from api_kiwix_remove - Add SIGHUP to kiwix-serve after kiwix-manage remove - Delete linked scrape_jobs rows during ZIM removal - Update api_scraper_delete to do full ZIM cleanup when applicable - Set chromium_path for single-file browser crawl support - Add status.db to .gitignore Co-Authored-By: Claude Opus 4.6 --- .gitignore | 1 + config.yaml | 2 +- lib/api.py | 96 +++++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 87 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 3fb01ef..bce13d8 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ recon.db # Kiwix binary tools (installed from tarball) bin/ +status.db diff --git a/config.yaml b/config.yaml index bdabf69..082be93 100644 --- a/config.yaml +++ b/config.yaml @@ -487,7 +487,7 @@ scraper: # SingleFile CLI settings (browser crawl mode) singlefile: executable: single-file - chromium_path: "" # Auto-detected from Playwright if empty + chromium_path: "/usr/bin/chromium-browser" crawl_max_depth: 10 # Stream B: New Library Pipeline diff --git a/lib/api.py b/lib/api.py index ce0381f..b5cb8b5 100644 --- a/lib/api.py +++ b/lib/api.py @@ -2060,23 +2060,24 @@ def api_kiwix_upload(): -@app.route('/api/kiwix/remove/', methods=['POST']) -def api_kiwix_remove(source_id): - """Remove a ZIM source: delete vectors, DB records, library entry, and file.""" +def _full_zim_cleanup(source_id): + """Full ZIM cleanup: Qdrant vectors, DB records, kiwix-manage, SIGHUP, file delete. + Returns dict with results. 
Caller handles cache refresh.""" import subprocess + import signal import requests as req db = StatusDB() conn = db._get_conn() row = conn.execute("SELECT * FROM zim_sources WHERE id = ?", (source_id,)).fetchone() if not row: - return jsonify({'error': 'Source not found'}), 404 + return None zim_source = dict(row) zim_filename = zim_source['zim_filename'] zim_path = zim_source['zim_path'] zim_title = zim_source.get('title', zim_filename) - results = {'vectors_deleted': 0, 'docs_deleted': 0, 'file_deleted': False} + results = {'vectors_deleted': 0, 'docs_deleted': 0, 'file_deleted': False, 'scrape_jobs_deleted': 0} # Step 1: Find all document hashes for this ZIM source doc_hashes = [r['hash'] for r in conn.execute( @@ -2135,7 +2136,6 @@ def api_kiwix_remove(source_id): # Step 4: Remove from kiwix-serve library try: - # Get the book ID from library.xml subprocess.run( ['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'remove', zim_filename.replace('.zim', '')], capture_output=True, text=True, timeout=10 @@ -2143,6 +2143,16 @@ def api_kiwix_remove(source_id): except Exception as e: logger.warning(f"kiwix-manage remove failed: {e}") + # Step 4b: SIGHUP kiwix-serve to reload library + try: + result = subprocess.run(['pidof', 'kiwix-serve'], capture_output=True, text=True, timeout=5) + if result.returncode == 0 and result.stdout.strip(): + pid = int(result.stdout.strip().split()[0]) + os.kill(pid, signal.SIGHUP) + logger.info(f"Sent SIGHUP to kiwix-serve (pid {pid})") + except Exception as e: + logger.warning(f"Failed to signal kiwix-serve: {e}") + # Step 5: Delete the ZIM file if os.path.isfile(zim_path): try: @@ -2152,13 +2162,37 @@ def api_kiwix_remove(source_id): logger.warning(f"ZIM file delete failed: {e}") results['file_deleted'] = False + # Step 6: Delete any linked scrape_jobs rows + try: + res = conn.execute("DELETE FROM scrape_jobs WHERE zim_source_id = ?", (source_id,)) + conn.commit() + results['scrape_jobs_deleted'] = res.rowcount + except Exception 
as e: + logger.warning(f"scrape_jobs cleanup failed: {e}") + + logger.info(f"Full ZIM cleanup for source {source_id} ('{zim_title}'): {results}") + return results + + +@app.route('/api/kiwix/remove/', methods=['POST']) +def api_kiwix_remove(source_id): + """Remove a ZIM source: delete vectors, DB records, library entry, and file.""" + db = StatusDB() + conn = db._get_conn() + row = conn.execute("SELECT * FROM zim_sources WHERE id = ?", (source_id,)).fetchone() + if not row: + return jsonify({'error': 'Source not found'}), 404 + + results = _full_zim_cleanup(source_id) + if results is None: + return jsonify({'error': 'Source not found during cleanup'}), 404 + # Refresh cache try: _cache['kiwix_sources'] = _build_kiwix_sources() except Exception: pass - logger.info(f"Removed ZIM source '{zim_title}': {results}") return jsonify({'ok': True, 'results': results}) @@ -2375,20 +2409,60 @@ def api_scraper_retry(job_id): @app.route('/api/scraper/delete/', methods=['POST']) def api_scraper_delete(job_id): - """Delete a scrape job (only if not currently running).""" + """Delete a scrape job and clean up any associated ZIM artifacts.""" + import subprocess + import signal + db = StatusDB() job = db.get_scrape_job(job_id) if not job: return jsonify({'error': 'Job not found'}), 404 if job['status'] == 'running': - return jsonify({'error': 'Cannot delete a running job — cancel it first'}), 400 + return jsonify({'error': 'Cannot delete a running job \u2014 cancel it first'}), 400 + zim_cleanup_results = None + + # If the job has a linked zim_source, do full cleanup + if job.get('zim_source_id'): + zim_cleanup_results = _full_zim_cleanup(job['zim_source_id']) + try: + _cache['kiwix_sources'] = _build_kiwix_sources() + except Exception: + pass + elif job.get('zim_filename'): + # No zim_source row, but there may be an orphan file + library entry + zim_path = os.path.join('/mnt/kiwix', job['zim_filename']) + if os.path.isfile(zim_path): + try: + os.remove(zim_path) + 
logger.info(f"Deleted orphan ZIM file: {zim_path}") + except Exception as e: + logger.warning(f"Failed to delete orphan ZIM file {zim_path}: {e}") + try: + subprocess.run( + ['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'remove', + job['zim_filename'].replace('.zim', '')], + capture_output=True, text=True, timeout=10 + ) + except Exception as e: + logger.warning(f"kiwix-manage remove failed for orphan: {e}") + try: + result = subprocess.run(['pidof', 'kiwix-serve'], capture_output=True, text=True, timeout=5) + if result.returncode == 0 and result.stdout.strip(): + pid = int(result.stdout.strip().split()[0]) + os.kill(pid, signal.SIGHUP) + logger.info(f"Sent SIGHUP to kiwix-serve (pid {pid})") + except Exception as e: + logger.warning(f"Failed to signal kiwix-serve: {e}") + + # Delete the scrape_jobs row (may already be gone if _full_zim_cleanup deleted it) conn = db._get_conn() conn.execute("DELETE FROM scrape_jobs WHERE id = ?", (job_id,)) conn.commit() - logger.info(f"Scraper job {job_id} deleted") - return jsonify({'ok': True}) + + logger.info(f"Scraper job {job_id} deleted (zim_cleanup={zim_cleanup_results})") + return jsonify({'ok': True, 'zim_cleanup': zim_cleanup_results}) @app.route('/api/scraper/clear-failed', methods=['POST']) From 8945c82e3f16f248d06314600135209659af1867 Mon Sep 17 00:00:00 2001 From: Matt Date: Sun, 19 Apr 2026 14:06:23 +0000 Subject: [PATCH 07/11] Replace wget/SingleFile/Playwright backends with Zimit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Zimit Docker container handles all site types (static, SPA, JS redirects) - Removed: _detect_crawl_mode, _crawl_wget, _crawl_singlefile, preflight logic - Added: _crawl_zimit() with Docker lifecycle management - Simplified pipeline: submit → Zimit crawl → kiwix-manage register → done - No more zimwriterfs step — Zimit produces ZIM directly - Dashboard UI simplified: removed crawl mode dropdown - Config simplified: removed reject 
patterns, preflight, singlefile sections Co-Authored-By: Claude Opus 4.6 --- config.yaml | 75 +--- lib/api.py | 67 ++-- lib/scraper_runner.py | 647 +++++++++-------------------------- static/js/scraper.js | 15 +- templates/kiwix/scraper.html | 14 +- 5 files changed, 212 insertions(+), 606 deletions(-) diff --git a/config.yaml b/config.yaml index 082be93..a2709b0 100644 --- a/config.yaml +++ b/config.yaml @@ -414,81 +414,12 @@ peertube: poll_interval: 1800 # Seconds between PeerTube acquisition polls (30 min) scraper: - workspace: /opt/recon/data/scraper # Working directory for wget mirrors + ZIM builds + workspace: /opt/recon/data/scraper # Working directory (tmp dirs for Zimit output) output_dir: /mnt/kiwix # Finished .zim files land here (kiwix-serve library) - rate_limit_delay: 0.5 # Seconds between wget requests (--wait) - wait_random: 1.0 # Random jitter added to wait (--random-wait range) default_language: eng # ISO 639-3 language code for ZIM metadata - user_agent: "Mozilla/5.0 (compatible; RECON/1.0; +https://echo6.co)" poll_interval: 300 # Seconds between checking for pending scrape jobs - keep_workspace_on_failure: true # Retain workspace for debugging when a job fails - - # Default URL patterns rejected by wget --reject-regex. - # Covers common CMS junk across WordPress, Squarespace, Wix, Ghost, Drupal, etc. - # Per-job overrides: additional_reject_patterns (appended) or skip_default_patterns (bypass). 
- default_reject_patterns: - # WordPress - - '\?share=' - - '\?replytocom=' - - '\?like_comment=' - - '/feed/' - - '/wp-json/' - - '/wp-login' - - '/wp-admin' - - '/wp-cron' - - '\?attachment_id=' - - '/xmlrpc' - - '/trackback' - - '/comment-page-' - - '\?doing_wp_cron' - # Squarespace - - '\?format=json' - - '\?format=rss' - - '/api/' - # Wix - - '/_api/' - - '/_partials/' - # Ghost - - '/ghost/' - - '/p/' - # Drupal - - '\?q=comment' - - '\?q=node' - - '/user/login' - - '/user/register' - # General CMS / site chrome - - '/login' - - '/signup' - - '/register' - - '/cart' - - '/checkout' - - '/search\?' - - '/tag/' - - '/author/' - - '\?print=' - - '\?pdf=' - - '\?format=amp' - - '\?preview=' - - '/rss' - - '/atom' - - '/cdn-cgi/' - - # Pre-flight mode detection - preflight: - enabled: true - timeout: 30 # Seconds for single-page Playwright fetch - min_static_size: 5120 # 5KB - wget HTML below this = suspect JS site - min_browser_size: 20480 # 20KB - browser HTML above this confirms JS - spa_markers: - - 'div#root' - - 'div#app' - - 'div#__next' - - # SingleFile CLI settings (browser crawl mode) - singlefile: - executable: single-file - chromium_path: "/usr/bin/chromium-browser" - crawl_max_depth: 10 + docker_image: ghcr.io/openzim/zimit # Zimit Docker image for web crawling + docker_workers: 2 # Concurrent crawl workers inside Zimit container # Stream B: New Library Pipeline new_pipeline: diff --git a/lib/api.py b/lib/api.py index b5cb8b5..6a3d627 100644 --- a/lib/api.py +++ b/lib/api.py @@ -44,6 +44,20 @@ app = Flask(__name__, app.config['MAX_CONTENT_LENGTH'] = None # ZIM files can be multi-GB + +# ── Large ZIM upload support ── +# Override stream factory so ZIM uploads write directly to /mnt/kiwix/ +# instead of /tmp (which is on the 96GB root disk and can't hold 100GB+ ZIMs). 
+from flask import Request as _FlaskRequest + +class _LargeZimRequest(_FlaskRequest): + def _get_file_stream(self, total_content_length, content_type, filename=None, content_length=None): + if filename and filename.lower().endswith('.zim'): + return tempfile.NamedTemporaryFile('wb+', dir='/mnt/kiwix', prefix='.upload_', suffix='.tmp', delete=False) + return super()._get_file_stream(total_content_length, content_type, filename, content_length) + +app.request_class = _LargeZimRequest + # ── Navigation Constants ── KNOWLEDGE_SUBNAV = [ @@ -2020,14 +2034,23 @@ def api_kiwix_upload(): filename = secure_filename(f.filename) dest = os.path.join('/mnt/kiwix', filename) - tmp_dest = dest + '.tmp' try: - f.save(tmp_dest) - os.rename(tmp_dest, dest) + # Stream was written directly to /mnt/kiwix/ by _LargeZimRequest — + # rename in-place instead of copying 100GB+ through f.save() + if hasattr(f.stream, 'name') and f.stream.name: + tmp_path = f.stream.name + f.stream.close() + os.rename(tmp_path, dest) + else: + tmp_dest = dest + '.tmp' + f.save(tmp_dest) + os.rename(tmp_dest, dest) except Exception as e: - if os.path.exists(tmp_dest): - os.remove(tmp_dest) + # Clean up any temp files on failure + for p in [locals().get('tmp_path', ''), locals().get('tmp_dest', '')]: + if p and os.path.exists(p): + os.remove(p) return jsonify({'error': f'Save failed: {e}'}), 500 # Register with kiwix-serve library @@ -2320,24 +2343,11 @@ def api_scraper_submit(): title = data.get('title', '').strip() or None category = data.get('category', '').strip() or None - # Optional per-job reject pattern overrides - additional_reject_patterns = data.get('additional_reject_patterns') - skip_default_patterns = bool(data.get('skip_default_patterns', False)) - - # Optional crawl mode override (static, browser, redirect, or null for auto-detect) - crawl_mode = data.get('crawl_mode') - if crawl_mode and crawl_mode not in ('static', 'browser', 'redirect'): - return jsonify({'error': "crawl_mode must be 
'static', 'browser', 'redirect', or null"}), 400 - - # Serialize additional patterns as JSON if provided - import json as _json - additional_json = _json.dumps(additional_reject_patterns) if additional_reject_patterns else None - db = StatusDB() conn = db._get_conn() conn.execute( - "INSERT INTO scrape_jobs (url, title, language, category, additional_reject_patterns, skip_default_patterns, crawl_mode) VALUES (?, ?, ?, ?, ?, ?, ?)", - (url, title, language, category, additional_json, int(skip_default_patterns), crawl_mode) + "INSERT INTO scrape_jobs (url, title, language, category, crawl_mode) VALUES (?, ?, ?, ?, ?)", + (url, title, language, category, 'zimit') ) conn.commit() job_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0] @@ -2358,8 +2368,6 @@ def api_scraper_jobs(): @app.route('/api/scraper/cancel/', methods=['POST']) def api_scraper_cancel(job_id): """Cancel a scrape job.""" - import os as _os - import signal as _signal db = StatusDB() job = db.get_scrape_job(job_id) @@ -2372,13 +2380,14 @@ def api_scraper_cancel(job_id): # Set cancelled in DB — the runner loop checks this between phases db.update_scrape_job(job_id, status='cancelled') - # If there's an active subprocess, send SIGTERM - pid = job.get('subprocess_pid') - if pid: - try: - _os.kill(pid, _signal.SIGTERM) - except (ProcessLookupError, PermissionError): - pass # Process already gone + # Stop the Docker container if running + container_name = f'recon-scraper-{job_id}' + try: + import subprocess as _subprocess + _subprocess.run(['docker', 'rm', '-f', container_name], + capture_output=True, timeout=10) + except Exception: + pass logger.info(f"Scraper job {job_id} cancelled") return jsonify({'ok': True}) diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py index 280b874..f1e2efd 100644 --- a/lib/scraper_runner.py +++ b/lib/scraper_runner.py @@ -1,27 +1,21 @@ """ RECON Scraper Runner -Daemon loop that processes scrape jobs: crawl → zimwriterfs → kiwix-manage. 
-Supports two crawl backends: - - wget (static sites) — default - - SingleFile CLI (JS-rendered sites) — browser mode - -Pre-flight detection automatically chooses the right backend unless -crawl_mode is pre-set on the job. +Daemon loop that processes scrape jobs: crawl via Zimit → kiwix-manage. +Zimit (openZIM Docker crawler) handles all site types and produces ZIM +files directly — no separate zimwriterfs step needed. Public entry point: scraper_loop(stop_event, config). -Config section: scraper (workspace, output_dir, rate_limit_delay, preflight, singlefile) -DB table: scrape_jobs (status flow: pending → scraping → packaging → complete) +Config section: scraper (output_dir, docker_image, docker_workers, poll_interval) +DB table: scrape_jobs (status flow: pending → scraping → registering → complete) """ import glob as _glob -import json as _json import os import re import shutil import signal import subprocess -import tempfile import time from datetime import datetime, timezone from urllib.parse import urlparse @@ -39,6 +33,9 @@ def scraper_loop(stop_event, config): logger.info("Scraper runner started") + # Clean up any orphan Zimit containers from a previous crash + _cleanup_orphan_containers() + while not stop_event.is_set(): db = StatusDB() job = db.get_pending_scrape_job() @@ -97,314 +94,115 @@ def _kill_process(proc, timeout=5): proc.wait(timeout=2) -def _count_html_files(directory): - """Count HTML files in a directory tree.""" - count = 0 - for root, dirs, files in os.walk(directory): - for f in files: - if f.lower().endswith(('.html', '.htm')): - count += 1 - return count - - -def _find_welcome_page(content_dir, domain): - """Find the welcome page (index.html) in the wget mirror.""" - domain_dir = None - for entry in os.listdir(content_dir): - entry_path = os.path.join(content_dir, entry) - if os.path.isdir(entry_path): - domain_dir = entry_path - break - - if not domain_dir: - return None, content_dir - - for candidate in ['index.html', 'index.htm']: - 
path = os.path.join(domain_dir, candidate) - if os.path.isfile(path): - return candidate, domain_dir - - for root, dirs, files in os.walk(domain_dir): - for f in sorted(files): - if f.lower().endswith(('.html', '.htm')): - rel = os.path.relpath(os.path.join(root, f), domain_dir) - return rel, domain_dir - - return 'index.html', domain_dir - - -def _create_placeholder_illustration(path): - """Create a 48x48 placeholder PNG for zimwriterfs --illustration.""" - from PIL import Image - img = Image.new('RGB', (48, 48), color=(40, 192, 232)) - img.save(path, 'PNG') - - -# ── Crawl mode detection ────────────────────────────────────────── - - -def _get_chromium_path(config): - """Auto-detect Chromium from Playwright's cache, or use config override.""" - configured = config.get('scraper', {}).get('singlefile', {}).get('chromium_path', '') - if configured and os.path.isfile(configured): - return configured - # Playwright stores Chromium — check both root and user caches - search_paths = [ - os.path.expanduser('~/.cache/ms-playwright/chromium-*/chrome-linux*/chrome'), - '/root/.cache/ms-playwright/chromium-*/chrome-linux*/chrome', - ] - for pattern in search_paths: - matches = sorted(_glob.glob(pattern)) - if matches: - return matches[-1] - return None - - -def _detect_crawl_mode(url, config): - """ - Pre-flight detection: determine whether a URL needs a browser to crawl. - - Returns (mode, resolved_url) where mode is 'static', 'browser', or 'redirect'. - 'redirect' means the URL redirected to a different domain (parking page etc.); - resolved_url will be the final browser URL in that case. 
- """ - preflight_cfg = config.get('scraper', {}).get('preflight', {}) - if not preflight_cfg.get('enabled', True): - return 'static', url - - timeout = preflight_cfg.get('timeout', 30) - min_static = preflight_cfg.get('min_static_size', 5120) - min_browser = preflight_cfg.get('min_browser_size', 20480) - spa_markers = preflight_cfg.get('spa_markers', ['div#root', 'div#app', 'div#__next']) - - input_domain = urlparse(url).hostname or '' - if input_domain.startswith('www.'): - input_domain = input_domain[4:] - - # Step 1: wget single-page fetch - wget_html = '' - wget_size = 0 +def _cleanup_orphan_containers(): + """Remove any leftover recon-scraper-* Docker containers from a previous crash.""" try: - with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as tmp: - tmp_path = tmp.name result = subprocess.run( - ['wget', '-q', '-O', tmp_path, '--timeout=30', '--tries=1', url], - capture_output=True, text=True, timeout=timeout + 5 + ['docker', 'ps', '-a', '--filter', 'name=recon-scraper-', '--format', '{{.Names}}'], + capture_output=True, text=True, timeout=10 ) - if os.path.isfile(tmp_path): - wget_size = os.path.getsize(tmp_path) - with open(tmp_path, 'r', errors='replace') as f: - wget_html = f.read() - os.unlink(tmp_path) + if result.returncode == 0 and result.stdout.strip(): + for name in result.stdout.strip().split('\n'): + name = name.strip() + if name: + subprocess.run(['docker', 'rm', '-f', name], capture_output=True, timeout=10) + logger.info(f"Cleaned up orphan container: {name}") except Exception as e: - logger.debug(f"Preflight wget failed for {url}: {e}") - try: - os.unlink(tmp_path) - except Exception: - pass - - # Step 2: Playwright headless fetch - browser_html = '' - browser_size = 0 - browser_url = url - try: - from playwright.sync_api import sync_playwright - with sync_playwright() as p: - browser = p.chromium.launch( - headless=True, - args=['--no-sandbox', '--disable-dev-shm-usage'] - ) - page = browser.new_page() - page.goto(url, 
wait_until='networkidle', timeout=timeout * 1000) - browser_url = page.url - browser_html = page.content() - browser_size = len(browser_html.encode('utf-8')) - browser.close() - except Exception as e: - logger.debug(f"Preflight Playwright failed for {url}: {e}") - # If Playwright fails entirely, fall back to static - return 'static', url - - # Step 3: Decision logic - browser_domain = urlparse(browser_url).hostname or '' - if browser_domain.startswith('www.'): - browser_domain = browser_domain[4:] - - # Check for cross-domain redirect (parking page detection) - if browser_domain and input_domain and browser_domain != input_domain: - logger.info(f"Preflight: {url} redirected to different domain {browser_domain}, mode=redirect") - return 'redirect', browser_url - - # Check size disparity: small wget + large browser = JS-rendered - if wget_size < min_static and browser_size > min_browser: - logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=browser") - return 'browser', url - - # Check for SPA shell markers in wget HTML - if wget_html: - try: - from bs4 import BeautifulSoup - soup = BeautifulSoup(wget_html, 'html.parser') - for marker in spa_markers: - # marker is like 'div#root' — split tag and id - parts = marker.split('#', 1) - tag = parts[0] if parts[0] else 'div' - elem_id = parts[1] if len(parts) > 1 else None - elem = soup.find(tag, id=elem_id) if elem_id else soup.find(tag) - if elem: - text_content = elem.get_text(strip=True) - if len(text_content) < 100: - logger.info(f"Preflight: {url} has SPA marker {marker} with {len(text_content)} chars text, mode=browser") - return 'browser', url - except Exception as e: - logger.debug(f"Preflight SPA marker check failed: {e}") - - logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=static") - return 'static', url + logger.warning(f"Orphan container cleanup failed: {e}") -# ── Crawl backends ──────────────────────────────────────────────── +# ── Zimit crawl backend 
────────────────────────────────────────── -def _crawl_wget(job, url, site_dir, config, stop_event, db): +def _crawl_zimit(job, config, stop_event, db): """ - wget mirror crawl backend. - Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel. + Crawl a URL using Zimit (openZIM Docker crawler). + + Returns (page_count, zim_filename, error_msg). + On success: (count, filename, None) + On failure: (0, None, error_string) """ job_id = job['id'] + url = job['url'] + title = job.get('title') or _sanitize_domain(url) + language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng') + category = job.get('category') or '' + scraper_cfg = config.get('scraper', {}) - rate_limit_delay = scraper_cfg.get('rate_limit_delay', 0.5) - user_agent = scraper_cfg.get('user_agent', 'Mozilla/5.0 (compatible; RECON/1.0)') - keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True) - workspace = os.path.dirname(site_dir) + output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix') + docker_image = scraper_cfg.get('docker_image', 'ghcr.io/openzim/zimit') + docker_workers = scraper_cfg.get('docker_workers', 2) - # Build reject-regex from config defaults + per-job overrides - reject_patterns = [] - skip_defaults = bool(job.get('skip_default_patterns')) - if not skip_defaults: - reject_patterns.extend(scraper_cfg.get('default_reject_patterns', [])) - additional_raw = job.get('additional_reject_patterns') - if additional_raw: - try: - additional = _json.loads(additional_raw) if isinstance(additional_raw, str) else additional_raw - if isinstance(additional, list): - reject_patterns.extend(additional) - except (ValueError, TypeError): - pass - - wget_cmd = [ - 'wget', '--mirror', '--convert-links', '--adjust-extension', - '--page-requisites', '--no-parent', - '--restrict-file-names=windows', - f'--wait={rate_limit_delay}', '--random-wait', - f'--user-agent={user_agent}', - f'--directory-prefix={site_dir}', - '--timeout=30', 
'--tries=3', - ] - if reject_patterns: - combined_regex = '|'.join(f'({p})' for p in reject_patterns) - wget_cmd.extend([f'--reject-regex={combined_regex}']) - logger.info(f"Job {job_id}: reject-regex has {len(reject_patterns)} patterns") - wget_cmd.append(url) - - logger.info(f"Job {job_id}: wget mirror starting") - wget_log = os.path.join(workspace, 'wget.log') - try: - with open(wget_log, 'w') as log_fh: - proc = subprocess.Popen( - wget_cmd, - stdout=log_fh, stderr=subprocess.STDOUT, - ) - db.update_scrape_job(job_id, subprocess_pid=proc.pid) - - while proc.poll() is None: - if stop_event.is_set() or _check_cancelled(db, job_id): - _kill_process(proc) - return 0, 'cancelled' - try: - proc.wait(timeout=5) - except subprocess.TimeoutExpired: - pass - - db.update_scrape_job(job_id, subprocess_pid=None) - - if stop_event.is_set() or _check_cancelled(db, job_id): - return 0, 'cancelled' - - # wget returns 8 for some server errors but may still have useful content - if proc.returncode not in (0, 4, 6, 8): - output = '' - try: - with open(wget_log, 'r') as f: - f.seek(max(0, os.path.getsize(wget_log) - 500)) - output = f.read() - except Exception: - pass - return 0, f"wget failed with code {proc.returncode}: {output[-500:]}" - - except Exception as e: - return 0, f"wget error: {e}" - - page_count = _count_html_files(site_dir) - logger.info(f"Job {job_id}: wget complete, {page_count} HTML pages found") - - if page_count == 0: - return 0, 'wget produced no HTML files' - - return page_count, None - - -def _crawl_singlefile(job, url, site_dir, config, stop_event, db): - """ - SingleFile CLI crawl backend for JS-rendered sites. - Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel. 
- """ - job_id = job['id'] - scraper_cfg = config.get('scraper', {}) - sf_cfg = scraper_cfg.get('singlefile', {}) - keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True) - workspace = os.path.dirname(site_dir) - - executable = sf_cfg.get('executable', 'single-file') - chromium_path = _get_chromium_path(config) - crawl_max_depth = sf_cfg.get('crawl_max_depth', 10) - - if not chromium_path: - return 0, 'Chromium not found — cannot use browser crawl mode' - - # SingleFile outputs into site_dir// to match wget's structure domain = _sanitize_domain(url) - output_dir = os.path.join(site_dir, domain) - os.makedirs(output_dir, exist_ok=True) + date_tag = datetime.now().strftime('%Y-%m') + container_name = f'recon-scraper-{job_id}' + tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}') - sf_cmd = [ - executable, - '--crawl-links=true', - '--crawl-inner-links-only=true', - '--crawl-no-parent=true', - '--crawl-replace-URLs=true', - f'--crawl-max-depth={crawl_max_depth}', - f'--browser-executable-path={chromium_path}', - '--browser-headless=true', - '--browser-args=["--no-sandbox","--disable-dev-shm-usage"]', - f'--output-directory={output_dir}', - url, + # Clean up any pre-existing container with same name (retry scenario) + subprocess.run(['docker', 'rm', '-f', container_name], capture_output=True, timeout=10) + + os.makedirs(tmp_dir, exist_ok=True) + + description = f"Mirror of {domain}" + if category: + description = f"{category} — mirror of {domain}" + + docker_cmd = [ + 'docker', 'run', '--rm', + '--name', container_name, + '-v', f'{tmp_dir}:/output', + docker_image, + '--url', url, + '--name', _sanitize_filename(domain), + '--lang', language, + '--title', title, + '--description', description[:80], + '--output', '/output', + '--workers', str(docker_workers), ] - logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth})") - sf_log = os.path.join(workspace, 'singlefile.log') + logger.info(f"Job {job_id}: Zimit crawl starting — 
{url}") try: - with open(sf_log, 'w') as log_fh: - proc = subprocess.Popen( - sf_cmd, - stdout=log_fh, stderr=subprocess.STDOUT, - ) + proc = subprocess.Popen( + docker_cmd, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) db.update_scrape_job(job_id, subprocess_pid=proc.pid) + last_progress_check = 0 while proc.poll() is None: if stop_event.is_set() or _check_cancelled(db, job_id): + # Stop the Docker container + subprocess.run(['docker', 'rm', '-f', container_name], + capture_output=True, timeout=10) _kill_process(proc) - return 0, 'cancelled' + shutil.rmtree(tmp_dir, ignore_errors=True) + return 0, None, 'cancelled' + + # Check progress every 30s via docker logs + now = time.time() + if now - last_progress_check >= 30: + last_progress_check = now + try: + log_result = subprocess.run( + ['docker', 'logs', '--tail', '20', container_name], + capture_output=True, text=True, timeout=10 + ) + if log_result.returncode == 0 and log_result.stderr: + # Zimit/Browsertrix logs page counts — look for numbers + lines = log_result.stderr.strip().split('\n') + for line in reversed(lines): + # Look for patterns like "X pages" or page count indicators + match = re.search(r'(\d+)\s+page', line, re.IGNORECASE) + if match: + count = int(match.group(1)) + if count > 0: + db.update_scrape_job(job_id, page_count=count) + break + except Exception: + pass + try: proc.wait(timeout=5) except subprocess.TimeoutExpired: @@ -413,42 +211,59 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db): db.update_scrape_job(job_id, subprocess_pid=None) if stop_event.is_set() or _check_cancelled(db, job_id): - return 0, 'cancelled' + shutil.rmtree(tmp_dir, ignore_errors=True) + return 0, None, 'cancelled' if proc.returncode != 0: - output = '' + # Capture last 50 lines of docker logs for error context + error_msg = f"Zimit exited with code {proc.returncode}" try: - with open(sf_log, 'r') as f: - f.seek(max(0, os.path.getsize(sf_log) - 500)) - output = f.read() + log_result = 
subprocess.run( + ['docker', 'logs', '--tail', '50', container_name], + capture_output=True, text=True, timeout=10 + ) + log_text = (log_result.stderr or log_result.stdout or '').strip() + if log_text: + # Take last 500 chars + error_msg += f": {log_text[-500:]}" except Exception: pass - # SingleFile may still produce some files even with non-zero exit - page_count = _count_html_files(site_dir) - if page_count == 0: - return 0, f"SingleFile failed with code {proc.returncode}: {output[-500:]}" - logger.warning(f"Job {job_id}: SingleFile exited {proc.returncode} but produced {page_count} pages, continuing") + shutil.rmtree(tmp_dir, ignore_errors=True) + return 0, None, error_msg except Exception as e: - return 0, f"SingleFile error: {e}" + shutil.rmtree(tmp_dir, ignore_errors=True) + return 0, None, f"Zimit error: {e}" - # If no index.html exists, rename the first HTML file to index.html - index_path = os.path.join(output_dir, 'index.html') - if not os.path.isfile(index_path): - for f in sorted(os.listdir(output_dir)): - if f.lower().endswith(('.html', '.htm')): - src = os.path.join(output_dir, f) - os.rename(src, index_path) - logger.info(f"Job {job_id}: renamed {f} → index.html") - break + # Find the output ZIM file + zim_files = _glob.glob(os.path.join(tmp_dir, '*.zim')) + if not zim_files: + shutil.rmtree(tmp_dir, ignore_errors=True) + return 0, None, 'Zimit produced no ZIM file' - page_count = _count_html_files(site_dir) - logger.info(f"Job {job_id}: SingleFile complete, {page_count} HTML pages found") + src_zim = zim_files[0] # Should be exactly one - if page_count == 0: - return 0, 'SingleFile produced no HTML files' + # Use the page count last recorded in the DB by progress polling (0 if none was recorded) + page_count = 0 + try: + job_state = db.get_scrape_job(job_id) + page_count = job_state.get('page_count') or 0 + except Exception: + pass - return page_count, None + # Rename to final location + zim_filename =
f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim" + zim_path = os.path.join(output_dir, zim_filename) + try: + shutil.move(src_zim, zim_path) + except Exception as e: + shutil.rmtree(tmp_dir, ignore_errors=True) + return 0, None, f"Failed to move ZIM to output dir: {e}" + + shutil.rmtree(tmp_dir, ignore_errors=True) + logger.info(f"Job {job_id}: Zimit complete — {zim_filename}") + + return page_count, zim_filename, None # ── Main job pipeline ───────────────────────────────────────────── @@ -458,183 +273,43 @@ def _process_job(job, config, stop_event): """Execute the full scrape pipeline for a single job.""" db = StatusDB() job_id = job['id'] - url = job['url'] - title = job.get('title') or _sanitize_domain(url) - language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng') - category = job.get('category') or '' - scraper_cfg = config.get('scraper', {}) - workspace_root = scraper_cfg.get('workspace', '/opt/recon/data/scraper') - output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix') - keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True) + logger.info(f"Job {job_id}: starting scrape of {job['url']}") - workspace = os.path.join(workspace_root, str(job_id)) - site_dir = os.path.join(workspace, 'site') - os.makedirs(site_dir, exist_ok=True) - - domain = _sanitize_domain(url) - date_tag = datetime.now().strftime('%Y-%m') - zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim" - zim_path = os.path.join(output_dir, zim_filename) - - logger.info(f"Job {job_id}: starting scrape of {url}") + # ── Phase 1: Crawl via Zimit ─────────────────────────────────── db.update_scrape_job(job_id, status='scraping', - workspace_path=workspace, + crawl_mode='zimit', started_at=_now()) - # ── Phase 0: Pre-flight mode detection ───────────────────────── if stop_event.is_set() or _check_cancelled(db, job_id): - _handle_cancel(db, job_id, workspace, keep_workspace) + _handle_cancel(db, job_id) return - 
pre_set = job.get('crawl_mode') - if pre_set: - crawl_mode, resolved_url = pre_set, url - logger.info(f"Job {job_id}: using pre-set crawl_mode={crawl_mode}") - else: - crawl_mode, resolved_url = _detect_crawl_mode(url, config) - logger.info(f"Job {job_id}: detected crawl_mode={crawl_mode}") - - db.update_scrape_job(job_id, crawl_mode=crawl_mode) - - # If redirect detected, update domain/filename to match resolved URL - if crawl_mode == 'redirect' and resolved_url != url: - logger.info(f"Job {job_id}: URL resolved from {url} → {resolved_url}") - domain = _sanitize_domain(resolved_url) - zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim" - zim_path = os.path.join(output_dir, zim_filename) - - # ── Phase A: Crawl (dispatch to backend) ──────────────────────── - if stop_event.is_set() or _check_cancelled(db, job_id): - _handle_cancel(db, job_id, workspace, keep_workspace) - return - - if crawl_mode == 'browser': - page_count, error = _crawl_singlefile(job, resolved_url, site_dir, config, stop_event, db) - else: # 'static' or 'redirect' - page_count, error = _crawl_wget(job, resolved_url, site_dir, config, stop_event, db) + page_count, zim_filename, error = _crawl_zimit(job, config, stop_event, db) if error == 'cancelled': - _handle_cancel(db, job_id, workspace, keep_workspace) + _handle_cancel(db, job_id) return elif error: db.update_scrape_job(job_id, status='failed', - error_message=error, + error_message=error[:1000], subprocess_pid=None, completed_at=_now()) - if not keep_workspace: - shutil.rmtree(workspace, ignore_errors=True) return db.update_scrape_job(job_id, page_count=page_count) - # ── Phase B: Prepare zimwriterfs inputs ──────────────────────── + # ── Phase 2: Register with kiwix-serve ───────────────────────── if stop_event.is_set() or _check_cancelled(db, job_id): - _handle_cancel(db, job_id, workspace, keep_workspace) + _handle_cancel(db, job_id) return - welcome_page, content_dir = _find_welcome_page(site_dir, domain) - 
if welcome_page is None: - welcome_page = 'index.html' - - illustration_path = os.path.join(workspace, 'illustration.png') - _create_placeholder_illustration(illustration_path) - illust_dest = os.path.join(content_dir, 'illustration.png') - shutil.copy2(illustration_path, illust_dest) - - description = f"Mirror of {domain}" - if category: - description = f"{category} — mirror of {domain}" - - logger.info(f"Job {job_id}: packaging ZIM (welcome={welcome_page}, content_dir={content_dir})") - db.update_scrape_job(job_id, status='packaging') - - # ── Phase C: zimwriterfs ─────────────────────────────────────── - if stop_event.is_set() or _check_cancelled(db, job_id): - _handle_cancel(db, job_id, workspace, keep_workspace) - return - - zim_name = _sanitize_filename(domain) - long_description = f"Offline mirror of {resolved_url} created by RECON web scraper" - - zim_cmd = [ - 'zimwriterfs', - f'--welcome={welcome_page}', - f'--illustration=illustration.png', - f'--language={language}', - f'--title={title}', - f'--description={description[:80]}', - f'--longDescription={long_description[:4096]}', - f'--name={zim_name}', - f'--creator={domain}', - '--publisher=RECON', - content_dir, - zim_path, - ] - - zim_log = os.path.join(workspace, 'zimwriterfs.log') - try: - with open(zim_log, 'w') as log_fh: - proc = subprocess.Popen( - zim_cmd, - stdout=log_fh, stderr=subprocess.STDOUT, - ) - db.update_scrape_job(job_id, subprocess_pid=proc.pid) - - while proc.poll() is None: - if stop_event.is_set() or _check_cancelled(db, job_id): - _kill_process(proc) - _handle_cancel(db, job_id, workspace, keep_workspace) - return - try: - proc.wait(timeout=5) - except subprocess.TimeoutExpired: - pass - - db.update_scrape_job(job_id, subprocess_pid=None) - - if stop_event.is_set() or _check_cancelled(db, job_id): - _handle_cancel(db, job_id, workspace, keep_workspace) - return - - if proc.returncode != 0: - output = '' - try: - with open(zim_log, 'r') as f: - f.seek(max(0, 
os.path.getsize(zim_log) - 500)) - output = f.read() - except Exception: - pass - raise RuntimeError(f"zimwriterfs failed with code {proc.returncode}: {output[-500:]}") - - except RuntimeError: - raise - except Exception as e: - db.update_scrape_job(job_id, - status='failed', - error_message=f"zimwriterfs error: {e}", - subprocess_pid=None, - completed_at=_now()) - if not keep_workspace: - shutil.rmtree(workspace, ignore_errors=True) - return - - if not os.path.isfile(zim_path): - db.update_scrape_job(job_id, - status='failed', - error_message='zimwriterfs produced no output file', - completed_at=_now()) - return - - logger.info(f"Job {job_id}: ZIM created at {zim_path}") - - # ── Phase D: kiwix-manage + registration ─────────────────────── - if stop_event.is_set() or _check_cancelled(db, job_id): - _handle_cancel(db, job_id, workspace, keep_workspace) - return + db.update_scrape_job(job_id, status='registering') + output_dir = config.get('scraper', {}).get('output_dir', '/mnt/kiwix') + zim_path = os.path.join(output_dir, zim_filename) kiwix_manage = shutil.which('kiwix-manage') or '/opt/recon/bin/kiwix-manage' library_xml = '/mnt/kiwix/library.xml' @@ -670,26 +345,32 @@ def _process_job(job, config, stop_event): except Exception as e: logger.warning(f"Job {job_id}: scan_zims failed: {e}") - try: - shutil.rmtree(workspace, ignore_errors=True) - except Exception: - pass - + # ── Phase 3: Complete ────────────────────────────────────────── db.update_scrape_job(job_id, status='complete', zim_filename=zim_filename, zim_source_id=zim_source_id, completed_at=_now()) - logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages, mode={crawl_mode})") + logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages)") -def _handle_cancel(db, job_id, workspace, keep_workspace): - """Handle job cancellation: clean up and update status.""" +def _handle_cancel(db, job_id): + """Handle job cancellation: clean up Docker container and update status.""" + 
container_name = f'recon-scraper-{job_id}' + try: + subprocess.run(['docker', 'rm', '-f', container_name], + capture_output=True, timeout=10) + except Exception: + pass + + # Clean up tmp dir if it exists + output_dir = '/mnt/kiwix' + tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}') + shutil.rmtree(tmp_dir, ignore_errors=True) + logger.info(f"Job {job_id}: cancelled") db.update_scrape_job(job_id, status='cancelled', subprocess_pid=None, completed_at=_now()) - if not keep_workspace: - shutil.rmtree(workspace, ignore_errors=True) diff --git a/static/js/scraper.js b/static/js/scraper.js index 49ce178..3988ffe 100644 --- a/static/js/scraper.js +++ b/static/js/scraper.js @@ -12,7 +12,7 @@ jobs.forEach(function(j) { if (j.status === 'complete') complete++; else if (j.status === 'failed' || j.status === 'cancelled') failed++; - else if (j.status === 'running' || j.status === 'pending') active++; + else if (j.status === 'scraping' || j.status === 'registering' || j.status === 'pending') active++; }); RECON.set('sc-total', RECON.fmt(total)); RECON.set('sc-active', RECON.fmt(active)); @@ -27,14 +27,12 @@ var html = ''; jobs.forEach(function(j) { var badge = statusBadge(j.status); - var mode = j.crawl_mode ? - '' + j.crawl_mode + '' : '\u2014'; var pages = j.page_count ? RECON.fmt(j.page_count) : '\u2014'; var zim = j.zim_filename ? 
'' + j.zim_filename + '' : '\u2014'; var actions = ''; - if (j.status === 'running' || j.status === 'pending') { + if (j.status === 'scraping' || j.status === 'registering' || j.status === 'pending') { actions = ''; } else if (j.status === 'failed' || j.status === 'cancelled') { actions = ' ' + @@ -50,14 +48,13 @@ '' + '' + '' + - '' + '' + '' + '' + '' + ''; }); - if (!html) html = ''; + if (!html) html = ''; RECON.setHTML('sc-table-body', html); }).catch(function(err) { console.error('Scraper dashboard error:', err); @@ -67,7 +64,8 @@ function statusBadge(status) { var map = { 'pending': 'PENDING', - 'running': 'RUNNING', + 'scraping': 'SCRAPING', + 'registering': 'REGISTERING', 'complete': 'COMPLETE', 'failed': 'FAILED', 'cancelled': 'CANCELLED' @@ -98,12 +96,9 @@ var title = document.getElementById('sf-title').value.trim(); var lang = document.getElementById('sf-lang').value; var category = document.getElementById('sf-category').value.trim(); - var mode = document.getElementById('sf-mode').value; - if (title) body.title = title; if (lang) body.language = lang; if (category) body.category = category; - if (mode) body.crawl_mode = mode; var btn = document.getElementById('sf-submit-btn'); var feedback = document.getElementById('sf-feedback'); diff --git a/templates/kiwix/scraper.html b/templates/kiwix/scraper.html index 3c42f43..862ba0a 100644 --- a/templates/kiwix/scraper.html +++ b/templates/kiwix/scraper.html @@ -17,7 +17,7 @@ style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;"> -
+
- - - - -
@@ -75,7 +66,6 @@
- @@ -83,7 +73,7 @@ - +
' + j.id + '' + escHtml(displayUrl) + '' + escHtml(j.title || '\u2014') + '' + mode + '' + pages + '' + badge + errorTooltip(j) + '' + zim + '' + actions + '
No scrape jobs
No scrape jobs
ID URL TitleMode Pages Status ZIM
Loading...
Loading...
From 76076fc4ab87d6c7dec7bde564435b703b82cd7c Mon Sep 17 00:00:00 2001 From: Matt Date: Sun, 19 Apr 2026 14:13:34 +0000 Subject: [PATCH 08/11] Fix Zimit CLI: add subcommand, correct flag names, fix container cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Must pass `zimit` as command after image name (entrypoint execs args) - --url → --seeds, --name removed, --lang → --zim-lang, --workers → -w - Remove --rm so docker logs work after exit, manually rm container Co-Authored-By: Claude Opus 4.6 --- lib/scraper_runner.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py index f1e2efd..9658be5 100644 --- a/lib/scraper_runner.py +++ b/lib/scraper_runner.py @@ -148,17 +148,17 @@ def _crawl_zimit(job, config, stop_event, db): description = f"{category} — mirror of {domain}" docker_cmd = [ - 'docker', 'run', '--rm', + 'docker', 'run', '--name', container_name, '-v', f'{tmp_dir}:/output', docker_image, - '--url', url, - '--name', _sanitize_filename(domain), - '--lang', language, + 'zimit', + '--seeds', url, + '--zim-lang', language, '--title', title, '--description', description[:80], '--output', '/output', - '--workers', str(docker_workers), + '-w', str(docker_workers), ] logger.info(f"Job {job_id}: Zimit crawl starting — {url}") @@ -228,6 +228,9 @@ def _crawl_zimit(job, config, stop_event, db): error_msg += f": {log_text[-500:]}" except Exception: pass + # Remove container (no --rm flag, so we clean up manually) + subprocess.run(['docker', 'rm', '-f', container_name], + capture_output=True, timeout=10) shutil.rmtree(tmp_dir, ignore_errors=True) return 0, None, error_msg @@ -235,6 +238,10 @@ def _crawl_zimit(job, config, stop_event, db): shutil.rmtree(tmp_dir, ignore_errors=True) return 0, None, f"Zimit error: {e}" + # Remove container (no --rm flag, so we clean up manually after getting logs) + subprocess.run(['docker', 'rm', '-f', 
container_name], + capture_output=True, timeout=10) + # Find the output ZIM file zim_files = _glob.glob(os.path.join(tmp_dir, '*.zim')) if not zim_files: From b035ba3f203b3167259415c6261af0b555859ff7 Mon Sep 17 00:00:00 2001 From: Matt Date: Sun, 19 Apr 2026 14:30:42 +0000 Subject: [PATCH 09/11] Fix Zimit: add required --name flag for warc2zim warc2zim (called internally by zimit) requires --name for ZIM metadata. Without it, argument validation fails with exit code 2. Co-Authored-By: Claude Opus 4.6 --- lib/scraper_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py index 9658be5..eb50695 100644 --- a/lib/scraper_runner.py +++ b/lib/scraper_runner.py @@ -154,6 +154,7 @@ def _crawl_zimit(job, config, stop_event, db): docker_image, 'zimit', '--seeds', url, + '--name', _sanitize_filename(domain), '--zim-lang', language, '--title', title, '--description', description[:80], From 96920447900037310efcc0a30cb45d1a1c7effe5 Mon Sep 17 00:00:00 2001 From: Matt Date: Sun, 19 Apr 2026 19:33:50 +0000 Subject: [PATCH 10/11] Fix progress parsing for Browsertrix JSON log format Parse "crawled":N from Browsertrix crawlStatus JSON logs instead of looking for "N pages" pattern. Also check stdout (not just stderr). 
Co-Authored-By: Claude Opus 4.6 --- lib/scraper_runner.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py index eb50695..d6b0299 100644 --- a/lib/scraper_runner.py +++ b/lib/scraper_runner.py @@ -190,12 +190,12 @@ def _crawl_zimit(job, config, stop_event, db): ['docker', 'logs', '--tail', '20', container_name], capture_output=True, text=True, timeout=10 ) - if log_result.returncode == 0 and log_result.stderr: - # Zimit/Browsertrix logs page counts — look for numbers - lines = log_result.stderr.strip().split('\n') + if log_result.returncode == 0: + # Browsertrix logs JSON with "crawled":N — check both stdout and stderr + log_text = log_result.stdout or log_result.stderr or '' + lines = log_text.strip().split('\n') for line in reversed(lines): - # Look for patterns like "X pages" or page count indicators - match = re.search(r'(\d+)\s+page', line, re.IGNORECASE) + match = re.search(r'"crawled":(\d+)', line) if match: count = int(match.group(1)) if count > 0: From 5f5bcedab986b7b93b42d30e325feaca7a5ba214 Mon Sep 17 00:00:00 2001 From: Matt Date: Sun, 19 Apr 2026 19:35:42 +0000 Subject: [PATCH 11/11] Fix progress regex and SIGHUP/scan_zims race condition - Parse Browsertrix "crawled":N JSON format instead of "N pages" - Add 3s delay between SIGHUP to kiwix-serve and scan_zims() call so the OPDS catalog is reloaded before we query it for linking Co-Authored-By: Claude Opus 4.6 --- lib/scraper_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py index d6b0299..b83f145 100644 --- a/lib/scraper_runner.py +++ b/lib/scraper_runner.py @@ -339,6 +339,9 @@ def _process_job(job, config, stop_event): except Exception as e: logger.warning(f"Job {job_id}: failed to signal kiwix-serve: {e}") + # Wait for kiwix-serve to reload its catalog after SIGHUP + time.sleep(3) + zim_source_id = None try: from .zim_monitor import scan_zims