diff --git a/config.yaml b/config.yaml index 082be93..a2709b0 100644 --- a/config.yaml +++ b/config.yaml @@ -414,81 +414,12 @@ peertube: poll_interval: 1800 # Seconds between PeerTube acquisition polls (30 min) scraper: - workspace: /opt/recon/data/scraper # Working directory for wget mirrors + ZIM builds + workspace: /opt/recon/data/scraper # Working directory (tmp dirs for Zimit output) output_dir: /mnt/kiwix # Finished .zim files land here (kiwix-serve library) - rate_limit_delay: 0.5 # Seconds between wget requests (--wait) - wait_random: 1.0 # Random jitter added to wait (--random-wait range) default_language: eng # ISO 639-3 language code for ZIM metadata - user_agent: "Mozilla/5.0 (compatible; RECON/1.0; +https://echo6.co)" poll_interval: 300 # Seconds between checking for pending scrape jobs - keep_workspace_on_failure: true # Retain workspace for debugging when a job fails - - # Default URL patterns rejected by wget --reject-regex. - # Covers common CMS junk across WordPress, Squarespace, Wix, Ghost, Drupal, etc. - # Per-job overrides: additional_reject_patterns (appended) or skip_default_patterns (bypass). - default_reject_patterns: - # WordPress - - '\?share=' - - '\?replytocom=' - - '\?like_comment=' - - '/feed/' - - '/wp-json/' - - '/wp-login' - - '/wp-admin' - - '/wp-cron' - - '\?attachment_id=' - - '/xmlrpc' - - '/trackback' - - '/comment-page-' - - '\?doing_wp_cron' - # Squarespace - - '\?format=json' - - '\?format=rss' - - '/api/' - # Wix - - '/_api/' - - '/_partials/' - # Ghost - - '/ghost/' - - '/p/' - # Drupal - - '\?q=comment' - - '\?q=node' - - '/user/login' - - '/user/register' - # General CMS / site chrome - - '/login' - - '/signup' - - '/register' - - '/cart' - - '/checkout' - - '/search\?' - - '/tag/' - - '/author/' - - '\?print=' - - '\?pdf=' - - '\?format=amp' - - '\?preview=' - - '/rss' - - '/atom' - - '/cdn-cgi/' - - # Pre-flight mode detection - preflight: - enabled: true - timeout: 30 # Seconds for single-page Playwright fetch - min_static_size: 5120 # 5KB - wget HTML below this = suspect JS site - min_browser_size: 20480 # 20KB - browser HTML above this confirms JS - spa_markers: - - 'div#root' - - 'div#app' - - 'div#__next' - - # SingleFile CLI settings (browser crawl mode) - singlefile: - executable: single-file - chromium_path: "/usr/bin/chromium-browser" - crawl_max_depth: 10 + docker_image: ghcr.io/openzim/zimit # Zimit Docker image for web crawling + docker_workers: 2 # Concurrent crawl workers inside Zimit container # Stream B: New Library Pipeline new_pipeline: diff --git a/lib/api.py b/lib/api.py index b5cb8b5..6a3d627 100644 --- a/lib/api.py +++ b/lib/api.py @@ -44,6 +44,20 @@ app = Flask(__name__, app.config['MAX_CONTENT_LENGTH'] = None # ZIM files can be multi-GB + +# ── Large ZIM upload support ── +# Override stream factory so ZIM uploads write directly to /mnt/kiwix/ +# instead of /tmp (which is on the 96GB root disk and can't hold 100GB+ ZIMs). +from flask import Request as _FlaskRequest + +class _LargeZimRequest(_FlaskRequest): + def _get_file_stream(self, total_content_length, content_type, filename=None, content_length=None): + if filename and filename.lower().endswith('.zim'): + return tempfile.NamedTemporaryFile('wb+', dir='/mnt/kiwix', prefix='.upload_', suffix='.tmp', delete=False) + return super()._get_file_stream(total_content_length, content_type, filename, content_length) + +app.request_class = _LargeZimRequest + # ── Navigation Constants ── KNOWLEDGE_SUBNAV = [ @@ -2020,14 +2034,23 @@ def api_kiwix_upload(): filename = secure_filename(f.filename) dest = os.path.join('/mnt/kiwix', filename) - tmp_dest = dest + '.tmp' try: - f.save(tmp_dest) - os.rename(tmp_dest, dest) + # Stream was written directly to /mnt/kiwix/ by _LargeZimRequest — + # rename in-place instead of copying 100GB+ through f.save() + if hasattr(f.stream, 'name') and f.stream.name: + tmp_path = f.stream.name + f.stream.close() + os.rename(tmp_path, dest) + else: + tmp_dest = dest + '.tmp' + f.save(tmp_dest) + os.rename(tmp_dest, dest) except Exception as e: - if os.path.exists(tmp_dest): - os.remove(tmp_dest) + # Clean up any temp files on failure + for p in [locals().get('tmp_path', ''), locals().get('tmp_dest', '')]: + if p and os.path.exists(p): + os.remove(p) return jsonify({'error': f'Save failed: {e}'}), 500 # Register with kiwix-serve library @@ -2320,24 +2343,11 @@ def api_scraper_submit(): title = data.get('title', '').strip() or None category = data.get('category', '').strip() or None - # Optional per-job reject pattern overrides - additional_reject_patterns = data.get('additional_reject_patterns') - skip_default_patterns = bool(data.get('skip_default_patterns', False)) - - # Optional crawl mode override (static, browser, redirect, or null for auto-detect) - crawl_mode = data.get('crawl_mode') - if crawl_mode and crawl_mode not in ('static', 'browser', 'redirect'): - return jsonify({'error': "crawl_mode must be 'static', 'browser', 'redirect', or null"}), 400 - - # Serialize additional patterns as JSON if provided - import json as _json - additional_json = _json.dumps(additional_reject_patterns) if additional_reject_patterns else None - db = StatusDB() conn = db._get_conn() conn.execute( - "INSERT INTO scrape_jobs (url, title, language, category, additional_reject_patterns, skip_default_patterns, crawl_mode) VALUES (?, ?, ?, ?, ?, ?, ?)", - (url, title, language, category, additional_json, int(skip_default_patterns), crawl_mode) + "INSERT INTO scrape_jobs (url, title, language, category, crawl_mode) VALUES (?, ?, ?, ?, ?)", + (url, title, language, category, 'zimit') ) conn.commit() job_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0] @@ -2358,8 +2368,6 @@ def api_scraper_jobs(): @app.route('/api/scraper/cancel/', methods=['POST']) def api_scraper_cancel(job_id): """Cancel a scrape job.""" - import os as _os - import signal as _signal db = StatusDB() job = db.get_scrape_job(job_id) @@ -2372,13 +2380,14 @@ def api_scraper_cancel(job_id): # Set cancelled in DB — the runner loop checks this between phases db.update_scrape_job(job_id, status='cancelled') - # If there's an active subprocess, send SIGTERM - pid = job.get('subprocess_pid') - if pid: - try: - _os.kill(pid, _signal.SIGTERM) - except (ProcessLookupError, PermissionError): - pass # Process already gone + # Stop the Docker container if running + container_name = f'recon-scraper-{job_id}' + try: + import subprocess as _subprocess + _subprocess.run(['docker', 'rm', '-f', container_name], + capture_output=True, timeout=10) + except Exception: + pass logger.info(f"Scraper job {job_id} cancelled") return jsonify({'ok': True}) diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py index 280b874..f1e2efd 100644 --- a/lib/scraper_runner.py +++ b/lib/scraper_runner.py @@ -1,27 +1,21 @@ """ RECON Scraper Runner -Daemon loop that processes scrape jobs: crawl → zimwriterfs → kiwix-manage. -Supports two crawl backends: - - wget (static sites) — default - - SingleFile CLI (JS-rendered sites) — browser mode - -Pre-flight detection automatically chooses the right backend unless -crawl_mode is pre-set on the job. +Daemon loop that processes scrape jobs: crawl via Zimit → kiwix-manage. +Zimit (openZIM Docker crawler) handles all site types and produces ZIM +files directly — no separate zimwriterfs step needed. Public entry point: scraper_loop(stop_event, config). -Config section: scraper (workspace, output_dir, rate_limit_delay, preflight, singlefile) -DB table: scrape_jobs (status flow: pending → scraping → packaging → complete) +Config section: scraper (output_dir, docker_image, docker_workers, poll_interval) +DB table: scrape_jobs (status flow: pending → scraping → registering → complete) """ import glob as _glob -import json as _json import os import re import shutil import signal import subprocess -import tempfile import time from datetime import datetime, timezone from urllib.parse import urlparse @@ -39,6 +33,9 @@ def scraper_loop(stop_event, config): logger.info("Scraper runner started") + # Clean up any orphan Zimit containers from a previous crash + _cleanup_orphan_containers() + while not stop_event.is_set(): db = StatusDB() job = db.get_pending_scrape_job() @@ -97,314 +94,115 @@ def _kill_process(proc, timeout=5): proc.wait(timeout=2) -def _count_html_files(directory): - """Count HTML files in a directory tree.""" - count = 0 - for root, dirs, files in os.walk(directory): - for f in files: - if f.lower().endswith(('.html', '.htm')): - count += 1 - return count - - -def _find_welcome_page(content_dir, domain): - """Find the welcome page (index.html) in the wget mirror.""" - domain_dir = None - for entry in os.listdir(content_dir): - entry_path = os.path.join(content_dir, entry) - if os.path.isdir(entry_path): - domain_dir = entry_path - break - - if not domain_dir: - return None, content_dir - - for candidate in ['index.html', 'index.htm']: - path = os.path.join(domain_dir, candidate) - if os.path.isfile(path): - return candidate, domain_dir - - for root, dirs, files in os.walk(domain_dir): - for f in sorted(files): - if f.lower().endswith(('.html', '.htm')): - rel = os.path.relpath(os.path.join(root, f), domain_dir) - return rel, domain_dir - - return 'index.html', domain_dir - - -def _create_placeholder_illustration(path): - """Create a 48x48 placeholder PNG for zimwriterfs --illustration.""" - from PIL import Image - img = Image.new('RGB', (48, 48), color=(40, 192, 232)) - img.save(path, 'PNG') - - -# ── Crawl mode detection ────────────────────────────────────────── - - -def _get_chromium_path(config): - """Auto-detect Chromium from Playwright's cache, or use config override.""" - configured = config.get('scraper', {}).get('singlefile', {}).get('chromium_path', '') - if configured and os.path.isfile(configured): - return configured - # Playwright stores Chromium — check both root and user caches - search_paths = [ - os.path.expanduser('~/.cache/ms-playwright/chromium-*/chrome-linux*/chrome'), - '/root/.cache/ms-playwright/chromium-*/chrome-linux*/chrome', - ] - for pattern in search_paths: - matches = sorted(_glob.glob(pattern)) - if matches: - return matches[-1] - return None - - -def _detect_crawl_mode(url, config): - """ - Pre-flight detection: determine whether a URL needs a browser to crawl. - - Returns (mode, resolved_url) where mode is 'static', 'browser', or 'redirect'. - 'redirect' means the URL redirected to a different domain (parking page etc.); - resolved_url will be the final browser URL in that case. - """ - preflight_cfg = config.get('scraper', {}).get('preflight', {}) - if not preflight_cfg.get('enabled', True): - return 'static', url - - timeout = preflight_cfg.get('timeout', 30) - min_static = preflight_cfg.get('min_static_size', 5120) - min_browser = preflight_cfg.get('min_browser_size', 20480) - spa_markers = preflight_cfg.get('spa_markers', ['div#root', 'div#app', 'div#__next']) - - input_domain = urlparse(url).hostname or '' - if input_domain.startswith('www.'): - input_domain = input_domain[4:] - - # Step 1: wget single-page fetch - wget_html = '' - wget_size = 0 +def _cleanup_orphan_containers(): + """Remove any leftover recon-scraper-* Docker containers from a previous crash.""" try: - with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as tmp: - tmp_path = tmp.name result = subprocess.run( - ['wget', '-q', '-O', tmp_path, '--timeout=30', '--tries=1', url], - capture_output=True, text=True, timeout=timeout + 5 + ['docker', 'ps', '-a', '--filter', 'name=recon-scraper-', '--format', '{{.Names}}'], + capture_output=True, text=True, timeout=10 ) - if os.path.isfile(tmp_path): - wget_size = os.path.getsize(tmp_path) - with open(tmp_path, 'r', errors='replace') as f: - wget_html = f.read() - os.unlink(tmp_path) + if result.returncode == 0 and result.stdout.strip(): + for name in result.stdout.strip().split('\n'): + name = name.strip() + if name: + subprocess.run(['docker', 'rm', '-f', name], capture_output=True, timeout=10) + logger.info(f"Cleaned up orphan container: {name}") except Exception as e: - logger.debug(f"Preflight wget failed for {url}: {e}") - try: - os.unlink(tmp_path) - except Exception: - pass - - # Step 2: Playwright headless fetch - browser_html = '' - browser_size = 0 - browser_url = url - try: - from playwright.sync_api import sync_playwright - with sync_playwright() as p: - browser = p.chromium.launch( - headless=True, - args=['--no-sandbox', '--disable-dev-shm-usage'] - ) - page = browser.new_page() - page.goto(url, wait_until='networkidle', timeout=timeout * 1000) - browser_url = page.url - browser_html = page.content() - browser_size = len(browser_html.encode('utf-8')) - browser.close() - except Exception as e: - logger.debug(f"Preflight Playwright failed for {url}: {e}") - # If Playwright fails entirely, fall back to static - return 'static', url - - # Step 3: Decision logic - browser_domain = urlparse(browser_url).hostname or '' - if browser_domain.startswith('www.'): - browser_domain = browser_domain[4:] - - # Check for cross-domain redirect (parking page detection) - if browser_domain and input_domain and browser_domain != input_domain: - logger.info(f"Preflight: {url} redirected to different domain {browser_domain}, mode=redirect") - return 'redirect', browser_url - - # Check size disparity: small wget + large browser = JS-rendered - if wget_size < min_static and browser_size > min_browser: - logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=browser") - return 'browser', url - - # Check for SPA shell markers in wget HTML - if wget_html: - try: - from bs4 import BeautifulSoup - soup = BeautifulSoup(wget_html, 'html.parser') - for marker in spa_markers: - # marker is like 'div#root' — split tag and id - parts = marker.split('#', 1) - tag = parts[0] if parts[0] else 'div' - elem_id = parts[1] if len(parts) > 1 else None - elem = soup.find(tag, id=elem_id) if elem_id else soup.find(tag) - if elem: - text_content = elem.get_text(strip=True) - if len(text_content) < 100: - logger.info(f"Preflight: {url} has SPA marker {marker} with {len(text_content)} chars text, mode=browser") - return 'browser', url - except Exception as e: - logger.debug(f"Preflight SPA marker check failed: {e}") - - logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=static") - return 'static', url + logger.warning(f"Orphan container cleanup failed: {e}") -# ── Crawl backends ──────────────────────────────────────────────── +# ── Zimit crawl backend ────────────────────────────────────────── -def _crawl_wget(job, url, site_dir, config, stop_event, db): +def _crawl_zimit(job, config, stop_event, db): """ - wget mirror crawl backend. - Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel. + Crawl a URL using Zimit (openZIM Docker crawler). + + Returns (page_count, zim_filename, error_msg). + On success: (count, filename, None) + On failure: (0, None, error_string) """ job_id = job['id'] + url = job['url'] + title = job.get('title') or _sanitize_domain(url) + language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng') + category = job.get('category') or '' + scraper_cfg = config.get('scraper', {}) - rate_limit_delay = scraper_cfg.get('rate_limit_delay', 0.5) - user_agent = scraper_cfg.get('user_agent', 'Mozilla/5.0 (compatible; RECON/1.0)') - keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True) - workspace = os.path.dirname(site_dir) + output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix') + docker_image = scraper_cfg.get('docker_image', 'ghcr.io/openzim/zimit') + docker_workers = scraper_cfg.get('docker_workers', 2) - # Build reject-regex from config defaults + per-job overrides - reject_patterns = [] - skip_defaults = bool(job.get('skip_default_patterns')) - if not skip_defaults: - reject_patterns.extend(scraper_cfg.get('default_reject_patterns', [])) - additional_raw = job.get('additional_reject_patterns') - if additional_raw: - try: - additional = _json.loads(additional_raw) if isinstance(additional_raw, str) else additional_raw - if isinstance(additional, list): - reject_patterns.extend(additional) - except (ValueError, TypeError): - pass - - wget_cmd = [ - 'wget', '--mirror', '--convert-links', '--adjust-extension', - '--page-requisites', '--no-parent', - '--restrict-file-names=windows', - f'--wait={rate_limit_delay}', '--random-wait', - f'--user-agent={user_agent}', - f'--directory-prefix={site_dir}', - '--timeout=30', '--tries=3', - ] - if reject_patterns: - combined_regex = '|'.join(f'({p})' for p in reject_patterns) - wget_cmd.extend([f'--reject-regex={combined_regex}']) - logger.info(f"Job {job_id}: reject-regex has {len(reject_patterns)} patterns") - wget_cmd.append(url) - - logger.info(f"Job {job_id}: wget mirror starting") - wget_log = os.path.join(workspace, 'wget.log') - try: - with open(wget_log, 'w') as log_fh: - proc = subprocess.Popen( - wget_cmd, - stdout=log_fh, stderr=subprocess.STDOUT, - ) - db.update_scrape_job(job_id, subprocess_pid=proc.pid) - - while proc.poll() is None: - if stop_event.is_set() or _check_cancelled(db, job_id): - _kill_process(proc) - return 0, 'cancelled' - try: - proc.wait(timeout=5) - except subprocess.TimeoutExpired: - pass - - db.update_scrape_job(job_id, subprocess_pid=None) - - if stop_event.is_set() or _check_cancelled(db, job_id): - return 0, 'cancelled' - - # wget returns 8 for some server errors but may still have useful content - if proc.returncode not in (0, 4, 6, 8): - output = '' - try: - with open(wget_log, 'r') as f: - f.seek(max(0, os.path.getsize(wget_log) - 500)) - output = f.read() - except Exception: - pass - return 0, f"wget failed with code {proc.returncode}: {output[-500:]}" - - except Exception as e: - return 0, f"wget error: {e}" - - page_count = _count_html_files(site_dir) - logger.info(f"Job {job_id}: wget complete, {page_count} HTML pages found") - - if page_count == 0: - return 0, 'wget produced no HTML files' - - return page_count, None - - -def _crawl_singlefile(job, url, site_dir, config, stop_event, db): - """ - SingleFile CLI crawl backend for JS-rendered sites. - Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel. - """ - job_id = job['id'] - scraper_cfg = config.get('scraper', {}) - sf_cfg = scraper_cfg.get('singlefile', {}) - keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True) - workspace = os.path.dirname(site_dir) - - executable = sf_cfg.get('executable', 'single-file') - chromium_path = _get_chromium_path(config) - crawl_max_depth = sf_cfg.get('crawl_max_depth', 10) - - if not chromium_path: - return 0, 'Chromium not found — cannot use browser crawl mode' - - # SingleFile outputs into site_dir// to match wget's structure domain = _sanitize_domain(url) - output_dir = os.path.join(site_dir, domain) - os.makedirs(output_dir, exist_ok=True) + date_tag = datetime.now().strftime('%Y-%m') + container_name = f'recon-scraper-{job_id}' + tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}') - sf_cmd = [ - executable, - '--crawl-links=true', - '--crawl-inner-links-only=true', - '--crawl-no-parent=true', - '--crawl-replace-URLs=true', - f'--crawl-max-depth={crawl_max_depth}', - f'--browser-executable-path={chromium_path}', - '--browser-headless=true', - '--browser-args=["--no-sandbox","--disable-dev-shm-usage"]', - f'--output-directory={output_dir}', - url, + # Clean up any pre-existing container with same name (retry scenario) + subprocess.run(['docker', 'rm', '-f', container_name], capture_output=True, timeout=10) + + os.makedirs(tmp_dir, exist_ok=True) + + description = f"Mirror of {domain}" + if category: + description = f"{category} — mirror of {domain}" + + docker_cmd = [ + 'docker', 'run', '--rm', + '--name', container_name, + '-v', f'{tmp_dir}:/output', + docker_image, + '--url', url, + '--name', _sanitize_filename(domain), + '--lang', language, + '--title', title, + '--description', description[:80], + '--output', '/output', + '--workers', str(docker_workers), ] - logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth})") - sf_log = os.path.join(workspace, 'singlefile.log') + logger.info(f"Job {job_id}: Zimit crawl starting — {url}") try: - with open(sf_log, 'w') as log_fh: - proc = subprocess.Popen( - sf_cmd, - stdout=log_fh, stderr=subprocess.STDOUT, - ) + proc = subprocess.Popen( + docker_cmd, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) db.update_scrape_job(job_id, subprocess_pid=proc.pid) + last_progress_check = 0 while proc.poll() is None: if stop_event.is_set() or _check_cancelled(db, job_id): + # Stop the Docker container + subprocess.run(['docker', 'rm', '-f', container_name], + capture_output=True, timeout=10) _kill_process(proc) - return 0, 'cancelled' + shutil.rmtree(tmp_dir, ignore_errors=True) + return 0, None, 'cancelled' + + # Check progress every 30s via docker logs + now = time.time() + if now - last_progress_check >= 30: + last_progress_check = now + try: + log_result = subprocess.run( + ['docker', 'logs', '--tail', '20', container_name], + capture_output=True, text=True, timeout=10 + ) + if log_result.returncode == 0 and log_result.stderr: + # Zimit/Browsertrix logs page counts — look for numbers + lines = log_result.stderr.strip().split('\n') + for line in reversed(lines): + # Look for patterns like "X pages" or page count indicators + match = re.search(r'(\d+)\s+page', line, re.IGNORECASE) + if match: + count = int(match.group(1)) + if count > 0: + db.update_scrape_job(job_id, page_count=count) + break + except Exception: + pass + try: proc.wait(timeout=5) except subprocess.TimeoutExpired: @@ -413,42 +211,59 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db): db.update_scrape_job(job_id, subprocess_pid=None) if stop_event.is_set() or _check_cancelled(db, job_id): - return 0, 'cancelled' + shutil.rmtree(tmp_dir, ignore_errors=True) + return 0, None, 'cancelled' if proc.returncode != 0: - output = '' + # Capture last 50 lines of docker logs for error context + error_msg = f"Zimit exited with code {proc.returncode}" try: - with open(sf_log, 'r') as f: - f.seek(max(0, os.path.getsize(sf_log) - 500)) - output = f.read() + log_result = subprocess.run( + ['docker', 'logs', '--tail', '50', container_name], + capture_output=True, text=True, timeout=10 + ) + log_text = (log_result.stderr or log_result.stdout or '').strip() + if log_text: + # Take last 500 chars + error_msg += f": {log_text[-500:]}" except Exception: pass - # SingleFile may still produce some files even with non-zero exit - page_count = _count_html_files(site_dir) - if page_count == 0: - return 0, f"SingleFile failed with code {proc.returncode}: {output[-500:]}" - logger.warning(f"Job {job_id}: SingleFile exited {proc.returncode} but produced {page_count} pages, continuing") + shutil.rmtree(tmp_dir, ignore_errors=True) + return 0, None, error_msg except Exception as e: - return 0, f"SingleFile error: {e}" + shutil.rmtree(tmp_dir, ignore_errors=True) + return 0, None, f"Zimit error: {e}" - # If no index.html exists, rename the first HTML file to index.html - index_path = os.path.join(output_dir, 'index.html') - if not os.path.isfile(index_path): - for f in sorted(os.listdir(output_dir)): - if f.lower().endswith(('.html', '.htm')): - src = os.path.join(output_dir, f) - os.rename(src, index_path) - logger.info(f"Job {job_id}: renamed {f} → index.html") - break + # Find the output ZIM file + zim_files = _glob.glob(os.path.join(tmp_dir, '*.zim')) + if not zim_files: + shutil.rmtree(tmp_dir, ignore_errors=True) + return 0, None, 'Zimit produced no ZIM file' - page_count = _count_html_files(site_dir) - logger.info(f"Job {job_id}: SingleFile complete, {page_count} HTML pages found") + src_zim = zim_files[0] # Should be exactly one - if page_count == 0: - return 0, 'SingleFile produced no HTML files' + # Get page count from file size as rough estimate if we don't have one + page_count = 0 + try: + job_state = db.get_scrape_job(job_id) + page_count = job_state.get('page_count') or 0 + except Exception: + pass - return page_count, None + # Rename to final location + zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim" + zim_path = os.path.join(output_dir, zim_filename) + try: + shutil.move(src_zim, zim_path) + except Exception as e: + shutil.rmtree(tmp_dir, ignore_errors=True) + return 0, None, f"Failed to move ZIM to output dir: {e}" + + shutil.rmtree(tmp_dir, ignore_errors=True) + logger.info(f"Job {job_id}: Zimit complete — {zim_filename}") + + return page_count, zim_filename, None # ── Main job pipeline ───────────────────────────────────────────── @@ -458,183 +273,43 @@ def _process_job(job, config, stop_event): """Execute the full scrape pipeline for a single job.""" db = StatusDB() job_id = job['id'] - url = job['url'] - title = job.get('title') or _sanitize_domain(url) - language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng') - category = job.get('category') or '' - scraper_cfg = config.get('scraper', {}) - workspace_root = scraper_cfg.get('workspace', '/opt/recon/data/scraper') - output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix') - keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True) + logger.info(f"Job {job_id}: starting scrape of {job['url']}") - workspace = os.path.join(workspace_root, str(job_id)) - site_dir = os.path.join(workspace, 'site') - os.makedirs(site_dir, exist_ok=True) - - domain = _sanitize_domain(url) - date_tag = datetime.now().strftime('%Y-%m') - zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim" - zim_path = os.path.join(output_dir, zim_filename) - - logger.info(f"Job {job_id}: starting scrape of {url}") + # ── Phase 1: Crawl via Zimit ─────────────────────────────────── db.update_scrape_job(job_id, status='scraping', - workspace_path=workspace, + crawl_mode='zimit', started_at=_now()) - # ── Phase 0: Pre-flight mode detection ───────────────────────── if stop_event.is_set() or _check_cancelled(db, job_id): - _handle_cancel(db, job_id, workspace, keep_workspace) + _handle_cancel(db, job_id) return - pre_set = job.get('crawl_mode') - if pre_set: - crawl_mode, resolved_url = pre_set, url - logger.info(f"Job {job_id}: using pre-set crawl_mode={crawl_mode}") - else: - crawl_mode, resolved_url = _detect_crawl_mode(url, config) - logger.info(f"Job {job_id}: detected crawl_mode={crawl_mode}") - - db.update_scrape_job(job_id, crawl_mode=crawl_mode) - - # If redirect detected, update domain/filename to match resolved URL - if crawl_mode == 'redirect' and resolved_url != url: - logger.info(f"Job {job_id}: URL resolved from {url} → {resolved_url}") - domain = _sanitize_domain(resolved_url) - zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim" - zim_path = os.path.join(output_dir, zim_filename) - - # ── Phase A: Crawl (dispatch to backend) ──────────────────────── - if stop_event.is_set() or _check_cancelled(db, job_id): - _handle_cancel(db, job_id, workspace, keep_workspace) - return - - if crawl_mode == 'browser': - page_count, error = _crawl_singlefile(job, resolved_url, site_dir, config, stop_event, db) - else: # 'static' or 'redirect' - page_count, error = _crawl_wget(job, resolved_url, site_dir, config, stop_event, db) + page_count, zim_filename, error = _crawl_zimit(job, config, stop_event, db) if error == 'cancelled': - _handle_cancel(db, job_id, workspace, keep_workspace) + _handle_cancel(db, job_id) return elif error: db.update_scrape_job(job_id, status='failed', - error_message=error, + error_message=error[:1000], subprocess_pid=None, completed_at=_now()) - if not keep_workspace: - shutil.rmtree(workspace, ignore_errors=True) return db.update_scrape_job(job_id, page_count=page_count) - # ── Phase B: Prepare zimwriterfs inputs ──────────────────────── + # ── Phase 2: Register with kiwix-serve ───────────────────────── if stop_event.is_set() or _check_cancelled(db, job_id): - _handle_cancel(db, job_id, workspace, keep_workspace) + _handle_cancel(db, job_id) return - welcome_page, content_dir = _find_welcome_page(site_dir, domain) - if welcome_page is None: - welcome_page = 'index.html' - - illustration_path = os.path.join(workspace, 'illustration.png') - _create_placeholder_illustration(illustration_path) - illust_dest = os.path.join(content_dir, 'illustration.png') - shutil.copy2(illustration_path, illust_dest) - - description = f"Mirror of {domain}" - if category: - description = f"{category} — mirror of {domain}" - - logger.info(f"Job {job_id}: packaging ZIM (welcome={welcome_page}, content_dir={content_dir})") - db.update_scrape_job(job_id, status='packaging') - - # ── Phase C: zimwriterfs ─────────────────────────────────────── - if stop_event.is_set() or _check_cancelled(db, job_id): - _handle_cancel(db, job_id, workspace, keep_workspace) - return - - zim_name = _sanitize_filename(domain) - long_description = f"Offline mirror of {resolved_url} created by RECON web scraper" - - zim_cmd = [ - 'zimwriterfs', - f'--welcome={welcome_page}', - f'--illustration=illustration.png', - f'--language={language}', - f'--title={title}', - f'--description={description[:80]}', - f'--longDescription={long_description[:4096]}', - f'--name={zim_name}', - f'--creator={domain}', - '--publisher=RECON', - content_dir, - zim_path, - ] - - zim_log = os.path.join(workspace, 'zimwriterfs.log') - try: - with open(zim_log, 'w') as log_fh: - proc = subprocess.Popen( - zim_cmd, - stdout=log_fh, stderr=subprocess.STDOUT, - ) - db.update_scrape_job(job_id, subprocess_pid=proc.pid) - - while proc.poll() is None: - if stop_event.is_set() or _check_cancelled(db, job_id): - _kill_process(proc) - _handle_cancel(db, job_id, workspace, keep_workspace) - return - try: - proc.wait(timeout=5) - except subprocess.TimeoutExpired: - pass - - db.update_scrape_job(job_id, subprocess_pid=None) - - if stop_event.is_set() or _check_cancelled(db, job_id): - _handle_cancel(db, job_id, workspace, keep_workspace) - return - - if proc.returncode != 0: - output = '' - try: - with open(zim_log, 'r') as f: - f.seek(max(0, os.path.getsize(zim_log) - 500)) - output = f.read() - except Exception: - pass - raise RuntimeError(f"zimwriterfs failed with code {proc.returncode}: {output[-500:]}") - - except RuntimeError: - raise - except Exception as e: - db.update_scrape_job(job_id, - status='failed', - error_message=f"zimwriterfs error: {e}", - subprocess_pid=None, - completed_at=_now()) - if not keep_workspace: - shutil.rmtree(workspace, ignore_errors=True) - return - - if not os.path.isfile(zim_path): - db.update_scrape_job(job_id, - status='failed', - error_message='zimwriterfs produced no output file', - completed_at=_now()) - return - - logger.info(f"Job {job_id}: ZIM created at {zim_path}") - - # ── Phase D: kiwix-manage + registration ─────────────────────── - if stop_event.is_set() or _check_cancelled(db, job_id): - _handle_cancel(db, job_id, workspace, keep_workspace) - return + db.update_scrape_job(job_id, status='registering') + output_dir = config.get('scraper', {}).get('output_dir', '/mnt/kiwix') + zim_path = os.path.join(output_dir, zim_filename) kiwix_manage = shutil.which('kiwix-manage') or '/opt/recon/bin/kiwix-manage' library_xml = '/mnt/kiwix/library.xml' @@ -670,26 +345,32 @@ def _process_job(job, config, stop_event): except Exception as e: logger.warning(f"Job {job_id}: scan_zims failed: {e}") - try: - shutil.rmtree(workspace, ignore_errors=True) - except Exception: - pass - + # ── Phase 3: Complete ────────────────────────────────────────── db.update_scrape_job(job_id, status='complete', zim_filename=zim_filename, zim_source_id=zim_source_id, completed_at=_now()) - logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages, mode={crawl_mode})") + logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages)") -def _handle_cancel(db, job_id, workspace, keep_workspace): - """Handle job cancellation: clean up and update status.""" +def _handle_cancel(db, job_id): + """Handle job cancellation: clean up Docker container and update status.""" + container_name = f'recon-scraper-{job_id}' + try: + subprocess.run(['docker', 'rm', '-f', container_name], + capture_output=True, timeout=10) + except Exception: + pass + + # Clean up tmp dir if it exists + output_dir = '/mnt/kiwix' + tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}') + shutil.rmtree(tmp_dir, ignore_errors=True) + logger.info(f"Job {job_id}: cancelled") db.update_scrape_job(job_id, status='cancelled', subprocess_pid=None, completed_at=_now()) - if not keep_workspace: - shutil.rmtree(workspace, ignore_errors=True) diff --git a/static/js/scraper.js b/static/js/scraper.js index 49ce178..3988ffe 100644 --- a/static/js/scraper.js +++ b/static/js/scraper.js @@ -12,7 +12,7 @@ jobs.forEach(function(j) { if (j.status === 'complete') complete++; else if (j.status === 'failed' || j.status === 'cancelled') failed++; - else if (j.status === 'running' || j.status === 'pending') active++; + else if (j.status === 'scraping' || j.status === 'registering' || j.status === 'pending') active++; }); RECON.set('sc-total', RECON.fmt(total)); RECON.set('sc-active', RECON.fmt(active)); @@ -27,14 +27,12 @@ var html = ''; jobs.forEach(function(j) { var badge = statusBadge(j.status); - var mode = j.crawl_mode ? - '' + j.crawl_mode + '' : '\u2014'; var pages = j.page_count ? RECON.fmt(j.page_count) : '\u2014'; var zim = j.zim_filename ? '' + j.zim_filename + '' : '\u2014'; var actions = ''; - if (j.status === 'running' || j.status === 'pending') { + if (j.status === 'scraping' || j.status === 'registering' || j.status === 'pending') { actions = ''; } else if (j.status === 'failed' || j.status === 'cancelled') { actions = ' ' + @@ -50,14 +48,13 @@ '' + j.id + '' + '' + escHtml(displayUrl) + '' + '' + escHtml(j.title || '\u2014') + '' + - '' + mode + '' + '' + pages + '' + '' + badge + errorTooltip(j) + '' + '' + zim + '' + '' + actions + '' + ''; }); - if (!html) html = 'No scrape jobs'; + if (!html) html = 'No scrape jobs'; RECON.setHTML('sc-table-body', html); }).catch(function(err) { console.error('Scraper dashboard error:', err); @@ -67,7 +64,8 @@ function statusBadge(status) { var map = { 'pending': 'PENDING', - 'running': 'RUNNING', + 'scraping': 'SCRAPING', + 'registering': 'REGISTERING', 'complete': 'COMPLETE', 'failed': 'FAILED', 'cancelled': 'CANCELLED' @@ -98,12 +96,9 @@ var title = document.getElementById('sf-title').value.trim(); var lang = document.getElementById('sf-lang').value; var category = document.getElementById('sf-category').value.trim(); - var mode = document.getElementById('sf-mode').value; - if (title) body.title = title; if (lang) body.language = lang; if (category) body.category = category; - if (mode) body.crawl_mode = mode; var btn = document.getElementById('sf-submit-btn'); var feedback = document.getElementById('sf-feedback'); diff --git a/templates/kiwix/scraper.html b/templates/kiwix/scraper.html index 3c42f43..862ba0a 100644 --- a/templates/kiwix/scraper.html +++ b/templates/kiwix/scraper.html @@ -17,7 +17,7 @@ style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;"> -
+
- - - - -
@@ -75,7 +66,6 @@ ID URL Title - Mode Pages Status ZIM @@ -83,7 +73,7 @@ - Loading... + Loading...