Replace wget/SingleFile/Playwright backends with Zimit

- Zimit Docker container handles all site types (static, SPA, JS redirects)
- Removed: _detect_crawl_mode, _crawl_wget, _crawl_singlefile, preflight logic
- Added: _crawl_zimit() with Docker lifecycle management
- Simplified pipeline: submit → Zimit crawl → kiwix-manage register → done
- No more zimwriterfs step — Zimit produces ZIM directly
- Dashboard UI simplified: removed crawl mode dropdown
- Config simplified: removed reject patterns, preflight, singlefile sections

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-19 14:06:23 +00:00
commit 8945c82e3f
5 changed files with 212 additions and 606 deletions

View file

@ -414,81 +414,12 @@ peertube:
poll_interval: 1800 # Seconds between PeerTube acquisition polls (30 min) poll_interval: 1800 # Seconds between PeerTube acquisition polls (30 min)
scraper: scraper:
workspace: /opt/recon/data/scraper # Working directory for wget mirrors + ZIM builds workspace: /opt/recon/data/scraper # Working directory (not read by the Zimit crawl; its tmp dirs are created under output_dir)
output_dir: /mnt/kiwix # Finished .zim files land here (kiwix-serve library) output_dir: /mnt/kiwix # Finished .zim files land here (kiwix-serve library)
rate_limit_delay: 0.5 # Seconds between wget requests (--wait)
wait_random: 1.0 # Random jitter added to wait (--random-wait range)
default_language: eng # ISO 639-3 language code for ZIM metadata default_language: eng # ISO 639-3 language code for ZIM metadata
user_agent: "Mozilla/5.0 (compatible; RECON/1.0; +https://echo6.co)"
poll_interval: 300 # Seconds between checking for pending scrape jobs poll_interval: 300 # Seconds between checking for pending scrape jobs
keep_workspace_on_failure: true # Retain workspace for debugging when a job fails docker_image: ghcr.io/openzim/zimit # Zimit Docker image for web crawling
docker_workers: 2 # Concurrent crawl workers inside Zimit container
# Default URL patterns rejected by wget --reject-regex.
# Covers common CMS junk across WordPress, Squarespace, Wix, Ghost, Drupal, etc.
# Per-job overrides: additional_reject_patterns (appended) or skip_default_patterns (bypass).
default_reject_patterns:
# WordPress
- '\?share='
- '\?replytocom='
- '\?like_comment='
- '/feed/'
- '/wp-json/'
- '/wp-login'
- '/wp-admin'
- '/wp-cron'
- '\?attachment_id='
- '/xmlrpc'
- '/trackback'
- '/comment-page-'
- '\?doing_wp_cron'
# Squarespace
- '\?format=json'
- '\?format=rss'
- '/api/'
# Wix
- '/_api/'
- '/_partials/'
# Ghost
- '/ghost/'
- '/p/'
# Drupal
- '\?q=comment'
- '\?q=node'
- '/user/login'
- '/user/register'
# General CMS / site chrome
- '/login'
- '/signup'
- '/register'
- '/cart'
- '/checkout'
- '/search\?'
- '/tag/'
- '/author/'
- '\?print='
- '\?pdf='
- '\?format=amp'
- '\?preview='
- '/rss'
- '/atom'
- '/cdn-cgi/'
# Pre-flight mode detection
preflight:
enabled: true
timeout: 30 # Seconds for single-page Playwright fetch
min_static_size: 5120 # 5KB - wget HTML below this = suspect JS site
min_browser_size: 20480 # 20KB - browser HTML above this confirms JS
spa_markers:
- 'div#root'
- 'div#app'
- 'div#__next'
# SingleFile CLI settings (browser crawl mode)
singlefile:
executable: single-file
chromium_path: "/usr/bin/chromium-browser"
crawl_max_depth: 10
# Stream B: New Library Pipeline # Stream B: New Library Pipeline
new_pipeline: new_pipeline:

View file

@ -44,6 +44,20 @@ app = Flask(__name__,
app.config['MAX_CONTENT_LENGTH'] = None # ZIM files can be multi-GB app.config['MAX_CONTENT_LENGTH'] = None # ZIM files can be multi-GB
# ── Large ZIM upload support ──
# Override stream factory so ZIM uploads write directly to /mnt/kiwix/
# instead of /tmp (which is on the 96GB root disk and can't hold 100GB+ ZIMs).
from flask import Request as _FlaskRequest
class _LargeZimRequest(_FlaskRequest):
    """Request class that spools ``.zim`` uploads straight into /mnt/kiwix.

    Every other upload keeps the stock stream handling of the parent class.
    """

    def _get_file_stream(self, total_content_length, content_type, filename=None, content_length=None):
        """Return the writable stream that will receive one uploaded file.

        A part whose filename ends in ``.zim`` (case-insensitive) gets a named
        temporary file created in ``/mnt/kiwix``. The file is opened with
        ``delete=False``, so it stays on disk after it is closed; whoever
        consumes the upload has to rename or remove it.
        """
        is_zim_upload = bool(filename) and filename.lower().endswith('.zim')
        if not is_zim_upload:
            return super()._get_file_stream(total_content_length, content_type, filename, content_length)
        return tempfile.NamedTemporaryFile(
            'wb+',
            dir='/mnt/kiwix',
            prefix='.upload_',
            suffix='.tmp',
            delete=False,
        )
# ── Navigation Constants ── # ── Navigation Constants ──
KNOWLEDGE_SUBNAV = [ KNOWLEDGE_SUBNAV = [
@ -2020,14 +2034,23 @@ def api_kiwix_upload():
filename = secure_filename(f.filename) filename = secure_filename(f.filename)
dest = os.path.join('/mnt/kiwix', filename) dest = os.path.join('/mnt/kiwix', filename)
tmp_dest = dest + '.tmp'
try: try:
f.save(tmp_dest) # Stream was written directly to /mnt/kiwix/ by _LargeZimRequest —
os.rename(tmp_dest, dest) # rename in-place instead of copying 100GB+ through f.save()
if hasattr(f.stream, 'name') and f.stream.name:
tmp_path = f.stream.name
f.stream.close()
os.rename(tmp_path, dest)
else:
tmp_dest = dest + '.tmp'
f.save(tmp_dest)
os.rename(tmp_dest, dest)
except Exception as e: except Exception as e:
if os.path.exists(tmp_dest): # Clean up any temp files on failure
os.remove(tmp_dest) for p in [locals().get('tmp_path', ''), locals().get('tmp_dest', '')]:
if p and os.path.exists(p):
os.remove(p)
return jsonify({'error': f'Save failed: {e}'}), 500 return jsonify({'error': f'Save failed: {e}'}), 500
# Register with kiwix-serve library # Register with kiwix-serve library
@ -2320,24 +2343,11 @@ def api_scraper_submit():
title = data.get('title', '').strip() or None title = data.get('title', '').strip() or None
category = data.get('category', '').strip() or None category = data.get('category', '').strip() or None
# Optional per-job reject pattern overrides
additional_reject_patterns = data.get('additional_reject_patterns')
skip_default_patterns = bool(data.get('skip_default_patterns', False))
# Optional crawl mode override (static, browser, redirect, or null for auto-detect)
crawl_mode = data.get('crawl_mode')
if crawl_mode and crawl_mode not in ('static', 'browser', 'redirect'):
return jsonify({'error': "crawl_mode must be 'static', 'browser', 'redirect', or null"}), 400
# Serialize additional patterns as JSON if provided
import json as _json
additional_json = _json.dumps(additional_reject_patterns) if additional_reject_patterns else None
db = StatusDB() db = StatusDB()
conn = db._get_conn() conn = db._get_conn()
conn.execute( conn.execute(
"INSERT INTO scrape_jobs (url, title, language, category, additional_reject_patterns, skip_default_patterns, crawl_mode) VALUES (?, ?, ?, ?, ?, ?, ?)", "INSERT INTO scrape_jobs (url, title, language, category, crawl_mode) VALUES (?, ?, ?, ?, ?)",
(url, title, language, category, additional_json, int(skip_default_patterns), crawl_mode) (url, title, language, category, 'zimit')
) )
conn.commit() conn.commit()
job_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0] job_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
@ -2358,8 +2368,6 @@ def api_scraper_jobs():
@app.route('/api/scraper/cancel/<int:job_id>', methods=['POST']) @app.route('/api/scraper/cancel/<int:job_id>', methods=['POST'])
def api_scraper_cancel(job_id): def api_scraper_cancel(job_id):
"""Cancel a scrape job.""" """Cancel a scrape job."""
import os as _os
import signal as _signal
db = StatusDB() db = StatusDB()
job = db.get_scrape_job(job_id) job = db.get_scrape_job(job_id)
@ -2372,13 +2380,14 @@ def api_scraper_cancel(job_id):
# Set cancelled in DB — the runner loop checks this between phases # Set cancelled in DB — the runner loop checks this between phases
db.update_scrape_job(job_id, status='cancelled') db.update_scrape_job(job_id, status='cancelled')
# If there's an active subprocess, send SIGTERM # Stop the Docker container if running
pid = job.get('subprocess_pid') container_name = f'recon-scraper-{job_id}'
if pid: try:
try: import subprocess as _subprocess
_os.kill(pid, _signal.SIGTERM) _subprocess.run(['docker', 'rm', '-f', container_name],
except (ProcessLookupError, PermissionError): capture_output=True, timeout=10)
pass # Process already gone except Exception:
pass
logger.info(f"Scraper job {job_id} cancelled") logger.info(f"Scraper job {job_id} cancelled")
return jsonify({'ok': True}) return jsonify({'ok': True})

View file

@ -1,27 +1,21 @@
""" """
RECON Scraper Runner RECON Scraper Runner
Daemon loop that processes scrape jobs: crawl → zimwriterfs → kiwix-manage. Daemon loop that processes scrape jobs: crawl via Zimit → kiwix-manage.
Supports two crawl backends: Zimit (openZIM Docker crawler) handles all site types and produces ZIM
- wget (static sites) default files directly no separate zimwriterfs step needed.
- SingleFile CLI (JS-rendered sites) browser mode
Pre-flight detection automatically chooses the right backend unless
crawl_mode is pre-set on the job.
Public entry point: scraper_loop(stop_event, config). Public entry point: scraper_loop(stop_event, config).
Config section: scraper (workspace, output_dir, rate_limit_delay, preflight, singlefile) Config section: scraper (output_dir, docker_image, docker_workers, poll_interval)
DB table: scrape_jobs (status flow: pending → scraping → packaging → complete) DB table: scrape_jobs (status flow: pending → scraping → registering → complete)
""" """
import glob as _glob import glob as _glob
import json as _json
import os import os
import re import re
import shutil import shutil
import signal import signal
import subprocess import subprocess
import tempfile
import time import time
from datetime import datetime, timezone from datetime import datetime, timezone
from urllib.parse import urlparse from urllib.parse import urlparse
@ -39,6 +33,9 @@ def scraper_loop(stop_event, config):
logger.info("Scraper runner started") logger.info("Scraper runner started")
# Clean up any orphan Zimit containers from a previous crash
_cleanup_orphan_containers()
while not stop_event.is_set(): while not stop_event.is_set():
db = StatusDB() db = StatusDB()
job = db.get_pending_scrape_job() job = db.get_pending_scrape_job()
@ -97,314 +94,115 @@ def _kill_process(proc, timeout=5):
proc.wait(timeout=2) proc.wait(timeout=2)
def _count_html_files(directory): def _cleanup_orphan_containers():
"""Count HTML files in a directory tree.""" """Remove any leftover recon-scraper-* Docker containers from a previous crash."""
count = 0
for root, dirs, files in os.walk(directory):
for f in files:
if f.lower().endswith(('.html', '.htm')):
count += 1
return count
def _find_welcome_page(content_dir, domain):
"""Find the welcome page (index.html) in the wget mirror."""
domain_dir = None
for entry in os.listdir(content_dir):
entry_path = os.path.join(content_dir, entry)
if os.path.isdir(entry_path):
domain_dir = entry_path
break
if not domain_dir:
return None, content_dir
for candidate in ['index.html', 'index.htm']:
path = os.path.join(domain_dir, candidate)
if os.path.isfile(path):
return candidate, domain_dir
for root, dirs, files in os.walk(domain_dir):
for f in sorted(files):
if f.lower().endswith(('.html', '.htm')):
rel = os.path.relpath(os.path.join(root, f), domain_dir)
return rel, domain_dir
return 'index.html', domain_dir
def _create_placeholder_illustration(path):
"""Create a 48x48 placeholder PNG for zimwriterfs --illustration."""
from PIL import Image
img = Image.new('RGB', (48, 48), color=(40, 192, 232))
img.save(path, 'PNG')
# ── Crawl mode detection ──────────────────────────────────────────
def _get_chromium_path(config):
"""Auto-detect Chromium from Playwright's cache, or use config override."""
configured = config.get('scraper', {}).get('singlefile', {}).get('chromium_path', '')
if configured and os.path.isfile(configured):
return configured
# Playwright stores Chromium — check both root and user caches
search_paths = [
os.path.expanduser('~/.cache/ms-playwright/chromium-*/chrome-linux*/chrome'),
'/root/.cache/ms-playwright/chromium-*/chrome-linux*/chrome',
]
for pattern in search_paths:
matches = sorted(_glob.glob(pattern))
if matches:
return matches[-1]
return None
def _detect_crawl_mode(url, config):
"""
Pre-flight detection: determine whether a URL needs a browser to crawl.
Returns (mode, resolved_url) where mode is 'static', 'browser', or 'redirect'.
'redirect' means the URL redirected to a different domain (parking page etc.);
resolved_url will be the final browser URL in that case.
"""
preflight_cfg = config.get('scraper', {}).get('preflight', {})
if not preflight_cfg.get('enabled', True):
return 'static', url
timeout = preflight_cfg.get('timeout', 30)
min_static = preflight_cfg.get('min_static_size', 5120)
min_browser = preflight_cfg.get('min_browser_size', 20480)
spa_markers = preflight_cfg.get('spa_markers', ['div#root', 'div#app', 'div#__next'])
input_domain = urlparse(url).hostname or ''
if input_domain.startswith('www.'):
input_domain = input_domain[4:]
# Step 1: wget single-page fetch
wget_html = ''
wget_size = 0
try: try:
with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as tmp:
tmp_path = tmp.name
result = subprocess.run( result = subprocess.run(
['wget', '-q', '-O', tmp_path, '--timeout=30', '--tries=1', url], ['docker', 'ps', '-a', '--filter', 'name=recon-scraper-', '--format', '{{.Names}}'],
capture_output=True, text=True, timeout=timeout + 5 capture_output=True, text=True, timeout=10
) )
if os.path.isfile(tmp_path): if result.returncode == 0 and result.stdout.strip():
wget_size = os.path.getsize(tmp_path) for name in result.stdout.strip().split('\n'):
with open(tmp_path, 'r', errors='replace') as f: name = name.strip()
wget_html = f.read() if name:
os.unlink(tmp_path) subprocess.run(['docker', 'rm', '-f', name], capture_output=True, timeout=10)
logger.info(f"Cleaned up orphan container: {name}")
except Exception as e: except Exception as e:
logger.debug(f"Preflight wget failed for {url}: {e}") logger.warning(f"Orphan container cleanup failed: {e}")
try:
os.unlink(tmp_path)
except Exception:
pass
# Step 2: Playwright headless fetch
browser_html = ''
browser_size = 0
browser_url = url
try:
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch(
headless=True,
args=['--no-sandbox', '--disable-dev-shm-usage']
)
page = browser.new_page()
page.goto(url, wait_until='networkidle', timeout=timeout * 1000)
browser_url = page.url
browser_html = page.content()
browser_size = len(browser_html.encode('utf-8'))
browser.close()
except Exception as e:
logger.debug(f"Preflight Playwright failed for {url}: {e}")
# If Playwright fails entirely, fall back to static
return 'static', url
# Step 3: Decision logic
browser_domain = urlparse(browser_url).hostname or ''
if browser_domain.startswith('www.'):
browser_domain = browser_domain[4:]
# Check for cross-domain redirect (parking page detection)
if browser_domain and input_domain and browser_domain != input_domain:
logger.info(f"Preflight: {url} redirected to different domain {browser_domain}, mode=redirect")
return 'redirect', browser_url
# Check size disparity: small wget + large browser = JS-rendered
if wget_size < min_static and browser_size > min_browser:
logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=browser")
return 'browser', url
# Check for SPA shell markers in wget HTML
if wget_html:
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(wget_html, 'html.parser')
for marker in spa_markers:
# marker is like 'div#root' — split tag and id
parts = marker.split('#', 1)
tag = parts[0] if parts[0] else 'div'
elem_id = parts[1] if len(parts) > 1 else None
elem = soup.find(tag, id=elem_id) if elem_id else soup.find(tag)
if elem:
text_content = elem.get_text(strip=True)
if len(text_content) < 100:
logger.info(f"Preflight: {url} has SPA marker {marker} with {len(text_content)} chars text, mode=browser")
return 'browser', url
except Exception as e:
logger.debug(f"Preflight SPA marker check failed: {e}")
logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=static")
return 'static', url
# ── Crawl backends ──────────────────────────────────────────────── # ── Zimit crawl backend ──────────────────────────────────────────
def _crawl_wget(job, url, site_dir, config, stop_event, db): def _crawl_zimit(job, config, stop_event, db):
""" """
wget mirror crawl backend. Crawl a URL using Zimit (openZIM Docker crawler).
Returns (page_count, error_msg) error_msg is None on success, 'cancelled' on cancel.
Returns (page_count, zim_filename, error_msg).
On success: (count, filename, None)
On failure: (0, None, error_string)
""" """
job_id = job['id'] job_id = job['id']
url = job['url']
title = job.get('title') or _sanitize_domain(url)
language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng')
category = job.get('category') or ''
scraper_cfg = config.get('scraper', {}) scraper_cfg = config.get('scraper', {})
rate_limit_delay = scraper_cfg.get('rate_limit_delay', 0.5) output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix')
user_agent = scraper_cfg.get('user_agent', 'Mozilla/5.0 (compatible; RECON/1.0)') docker_image = scraper_cfg.get('docker_image', 'ghcr.io/openzim/zimit')
keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True) docker_workers = scraper_cfg.get('docker_workers', 2)
workspace = os.path.dirname(site_dir)
# Build reject-regex from config defaults + per-job overrides
reject_patterns = []
skip_defaults = bool(job.get('skip_default_patterns'))
if not skip_defaults:
reject_patterns.extend(scraper_cfg.get('default_reject_patterns', []))
additional_raw = job.get('additional_reject_patterns')
if additional_raw:
try:
additional = _json.loads(additional_raw) if isinstance(additional_raw, str) else additional_raw
if isinstance(additional, list):
reject_patterns.extend(additional)
except (ValueError, TypeError):
pass
wget_cmd = [
'wget', '--mirror', '--convert-links', '--adjust-extension',
'--page-requisites', '--no-parent',
'--restrict-file-names=windows',
f'--wait={rate_limit_delay}', '--random-wait',
f'--user-agent={user_agent}',
f'--directory-prefix={site_dir}',
'--timeout=30', '--tries=3',
]
if reject_patterns:
combined_regex = '|'.join(f'({p})' for p in reject_patterns)
wget_cmd.extend([f'--reject-regex={combined_regex}'])
logger.info(f"Job {job_id}: reject-regex has {len(reject_patterns)} patterns")
wget_cmd.append(url)
logger.info(f"Job {job_id}: wget mirror starting")
wget_log = os.path.join(workspace, 'wget.log')
try:
with open(wget_log, 'w') as log_fh:
proc = subprocess.Popen(
wget_cmd,
stdout=log_fh, stderr=subprocess.STDOUT,
)
db.update_scrape_job(job_id, subprocess_pid=proc.pid)
while proc.poll() is None:
if stop_event.is_set() or _check_cancelled(db, job_id):
_kill_process(proc)
return 0, 'cancelled'
try:
proc.wait(timeout=5)
except subprocess.TimeoutExpired:
pass
db.update_scrape_job(job_id, subprocess_pid=None)
if stop_event.is_set() or _check_cancelled(db, job_id):
return 0, 'cancelled'
# wget returns 8 for some server errors but may still have useful content
if proc.returncode not in (0, 4, 6, 8):
output = ''
try:
with open(wget_log, 'r') as f:
f.seek(max(0, os.path.getsize(wget_log) - 500))
output = f.read()
except Exception:
pass
return 0, f"wget failed with code {proc.returncode}: {output[-500:]}"
except Exception as e:
return 0, f"wget error: {e}"
page_count = _count_html_files(site_dir)
logger.info(f"Job {job_id}: wget complete, {page_count} HTML pages found")
if page_count == 0:
return 0, 'wget produced no HTML files'
return page_count, None
def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
"""
SingleFile CLI crawl backend for JS-rendered sites.
Returns (page_count, error_msg) error_msg is None on success, 'cancelled' on cancel.
"""
job_id = job['id']
scraper_cfg = config.get('scraper', {})
sf_cfg = scraper_cfg.get('singlefile', {})
keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True)
workspace = os.path.dirname(site_dir)
executable = sf_cfg.get('executable', 'single-file')
chromium_path = _get_chromium_path(config)
crawl_max_depth = sf_cfg.get('crawl_max_depth', 10)
if not chromium_path:
return 0, 'Chromium not found — cannot use browser crawl mode'
# SingleFile outputs into site_dir/<domain>/ to match wget's structure
domain = _sanitize_domain(url) domain = _sanitize_domain(url)
output_dir = os.path.join(site_dir, domain) date_tag = datetime.now().strftime('%Y-%m')
os.makedirs(output_dir, exist_ok=True) container_name = f'recon-scraper-{job_id}'
tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}')
sf_cmd = [ # Clean up any pre-existing container with same name (retry scenario)
executable, subprocess.run(['docker', 'rm', '-f', container_name], capture_output=True, timeout=10)
'--crawl-links=true',
'--crawl-inner-links-only=true', os.makedirs(tmp_dir, exist_ok=True)
'--crawl-no-parent=true',
'--crawl-replace-URLs=true', description = f"Mirror of {domain}"
f'--crawl-max-depth={crawl_max_depth}', if category:
f'--browser-executable-path={chromium_path}', description = f"{category} — mirror of {domain}"
'--browser-headless=true',
'--browser-args=["--no-sandbox","--disable-dev-shm-usage"]', docker_cmd = [
f'--output-directory={output_dir}', 'docker', 'run', '--rm',
url, '--name', container_name,
'-v', f'{tmp_dir}:/output',
docker_image,
'--url', url,
'--name', _sanitize_filename(domain),
'--lang', language,
'--title', title,
'--description', description[:80],
'--output', '/output',
'--workers', str(docker_workers),
] ]
logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth})") logger.info(f"Job {job_id}: Zimit crawl starting — {url}")
sf_log = os.path.join(workspace, 'singlefile.log')
try: try:
with open(sf_log, 'w') as log_fh: proc = subprocess.Popen(
proc = subprocess.Popen( docker_cmd,
sf_cmd, stdout=subprocess.DEVNULL,
stdout=log_fh, stderr=subprocess.STDOUT, stderr=subprocess.DEVNULL,
) )
db.update_scrape_job(job_id, subprocess_pid=proc.pid) db.update_scrape_job(job_id, subprocess_pid=proc.pid)
last_progress_check = 0
while proc.poll() is None: while proc.poll() is None:
if stop_event.is_set() or _check_cancelled(db, job_id): if stop_event.is_set() or _check_cancelled(db, job_id):
# Stop the Docker container
subprocess.run(['docker', 'rm', '-f', container_name],
capture_output=True, timeout=10)
_kill_process(proc) _kill_process(proc)
return 0, 'cancelled' shutil.rmtree(tmp_dir, ignore_errors=True)
return 0, None, 'cancelled'
# Check progress every 30s via docker logs
now = time.time()
if now - last_progress_check >= 30:
last_progress_check = now
try:
log_result = subprocess.run(
['docker', 'logs', '--tail', '20', container_name],
capture_output=True, text=True, timeout=10
)
if log_result.returncode == 0 and log_result.stderr:
# Zimit/Browsertrix logs page counts — look for numbers
lines = log_result.stderr.strip().split('\n')
for line in reversed(lines):
# Look for patterns like "X pages" or page count indicators
match = re.search(r'(\d+)\s+page', line, re.IGNORECASE)
if match:
count = int(match.group(1))
if count > 0:
db.update_scrape_job(job_id, page_count=count)
break
except Exception:
pass
try: try:
proc.wait(timeout=5) proc.wait(timeout=5)
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
@ -413,42 +211,59 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
db.update_scrape_job(job_id, subprocess_pid=None) db.update_scrape_job(job_id, subprocess_pid=None)
if stop_event.is_set() or _check_cancelled(db, job_id): if stop_event.is_set() or _check_cancelled(db, job_id):
return 0, 'cancelled' shutil.rmtree(tmp_dir, ignore_errors=True)
return 0, None, 'cancelled'
if proc.returncode != 0: if proc.returncode != 0:
output = '' # Capture last 50 lines of docker logs for error context
error_msg = f"Zimit exited with code {proc.returncode}"
try: try:
with open(sf_log, 'r') as f: log_result = subprocess.run(
f.seek(max(0, os.path.getsize(sf_log) - 500)) ['docker', 'logs', '--tail', '50', container_name],
output = f.read() capture_output=True, text=True, timeout=10
)
log_text = (log_result.stderr or log_result.stdout or '').strip()
if log_text:
# Take last 500 chars
error_msg += f": {log_text[-500:]}"
except Exception: except Exception:
pass pass
# SingleFile may still produce some files even with non-zero exit shutil.rmtree(tmp_dir, ignore_errors=True)
page_count = _count_html_files(site_dir) return 0, None, error_msg
if page_count == 0:
return 0, f"SingleFile failed with code {proc.returncode}: {output[-500:]}"
logger.warning(f"Job {job_id}: SingleFile exited {proc.returncode} but produced {page_count} pages, continuing")
except Exception as e: except Exception as e:
return 0, f"SingleFile error: {e}" shutil.rmtree(tmp_dir, ignore_errors=True)
return 0, None, f"Zimit error: {e}"
# If no index.html exists, rename the first HTML file to index.html # Find the output ZIM file
index_path = os.path.join(output_dir, 'index.html') zim_files = _glob.glob(os.path.join(tmp_dir, '*.zim'))
if not os.path.isfile(index_path): if not zim_files:
for f in sorted(os.listdir(output_dir)): shutil.rmtree(tmp_dir, ignore_errors=True)
if f.lower().endswith(('.html', '.htm')): return 0, None, 'Zimit produced no ZIM file'
src = os.path.join(output_dir, f)
os.rename(src, index_path)
logger.info(f"Job {job_id}: renamed {f} → index.html")
break
page_count = _count_html_files(site_dir) src_zim = zim_files[0] # Should be exactly one
logger.info(f"Job {job_id}: SingleFile complete, {page_count} HTML pages found")
if page_count == 0: # Get page count from file size as rough estimate if we don't have one
return 0, 'SingleFile produced no HTML files' page_count = 0
try:
job_state = db.get_scrape_job(job_id)
page_count = job_state.get('page_count') or 0
except Exception:
pass
return page_count, None # Rename to final location
zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
zim_path = os.path.join(output_dir, zim_filename)
try:
shutil.move(src_zim, zim_path)
except Exception as e:
shutil.rmtree(tmp_dir, ignore_errors=True)
return 0, None, f"Failed to move ZIM to output dir: {e}"
shutil.rmtree(tmp_dir, ignore_errors=True)
logger.info(f"Job {job_id}: Zimit complete — {zim_filename}")
return page_count, zim_filename, None
# ── Main job pipeline ───────────────────────────────────────────── # ── Main job pipeline ─────────────────────────────────────────────
@ -458,183 +273,43 @@ def _process_job(job, config, stop_event):
"""Execute the full scrape pipeline for a single job.""" """Execute the full scrape pipeline for a single job."""
db = StatusDB() db = StatusDB()
job_id = job['id'] job_id = job['id']
url = job['url']
title = job.get('title') or _sanitize_domain(url)
language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng')
category = job.get('category') or ''
scraper_cfg = config.get('scraper', {}) logger.info(f"Job {job_id}: starting scrape of {job['url']}")
workspace_root = scraper_cfg.get('workspace', '/opt/recon/data/scraper')
output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix')
keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True)
workspace = os.path.join(workspace_root, str(job_id)) # ── Phase 1: Crawl via Zimit ───────────────────────────────────
site_dir = os.path.join(workspace, 'site')
os.makedirs(site_dir, exist_ok=True)
domain = _sanitize_domain(url)
date_tag = datetime.now().strftime('%Y-%m')
zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
zim_path = os.path.join(output_dir, zim_filename)
logger.info(f"Job {job_id}: starting scrape of {url}")
db.update_scrape_job(job_id, db.update_scrape_job(job_id,
status='scraping', status='scraping',
workspace_path=workspace, crawl_mode='zimit',
started_at=_now()) started_at=_now())
# ── Phase 0: Pre-flight mode detection ─────────────────────────
if stop_event.is_set() or _check_cancelled(db, job_id): if stop_event.is_set() or _check_cancelled(db, job_id):
_handle_cancel(db, job_id, workspace, keep_workspace) _handle_cancel(db, job_id)
return return
pre_set = job.get('crawl_mode') page_count, zim_filename, error = _crawl_zimit(job, config, stop_event, db)
if pre_set:
crawl_mode, resolved_url = pre_set, url
logger.info(f"Job {job_id}: using pre-set crawl_mode={crawl_mode}")
else:
crawl_mode, resolved_url = _detect_crawl_mode(url, config)
logger.info(f"Job {job_id}: detected crawl_mode={crawl_mode}")
db.update_scrape_job(job_id, crawl_mode=crawl_mode)
# If redirect detected, update domain/filename to match resolved URL
if crawl_mode == 'redirect' and resolved_url != url:
logger.info(f"Job {job_id}: URL resolved from {url}{resolved_url}")
domain = _sanitize_domain(resolved_url)
zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
zim_path = os.path.join(output_dir, zim_filename)
# ── Phase A: Crawl (dispatch to backend) ────────────────────────
if stop_event.is_set() or _check_cancelled(db, job_id):
_handle_cancel(db, job_id, workspace, keep_workspace)
return
if crawl_mode == 'browser':
page_count, error = _crawl_singlefile(job, resolved_url, site_dir, config, stop_event, db)
else: # 'static' or 'redirect'
page_count, error = _crawl_wget(job, resolved_url, site_dir, config, stop_event, db)
if error == 'cancelled': if error == 'cancelled':
_handle_cancel(db, job_id, workspace, keep_workspace) _handle_cancel(db, job_id)
return return
elif error: elif error:
db.update_scrape_job(job_id, db.update_scrape_job(job_id,
status='failed', status='failed',
error_message=error, error_message=error[:1000],
subprocess_pid=None, subprocess_pid=None,
completed_at=_now()) completed_at=_now())
if not keep_workspace:
shutil.rmtree(workspace, ignore_errors=True)
return return
db.update_scrape_job(job_id, page_count=page_count) db.update_scrape_job(job_id, page_count=page_count)
# ── Phase B: Prepare zimwriterfs inputs ──────────────────────── # ── Phase 2: Register with kiwix-serve ─────────────────────────
if stop_event.is_set() or _check_cancelled(db, job_id): if stop_event.is_set() or _check_cancelled(db, job_id):
_handle_cancel(db, job_id, workspace, keep_workspace) _handle_cancel(db, job_id)
return return
welcome_page, content_dir = _find_welcome_page(site_dir, domain) db.update_scrape_job(job_id, status='registering')
if welcome_page is None:
welcome_page = 'index.html'
illustration_path = os.path.join(workspace, 'illustration.png')
_create_placeholder_illustration(illustration_path)
illust_dest = os.path.join(content_dir, 'illustration.png')
shutil.copy2(illustration_path, illust_dest)
description = f"Mirror of {domain}"
if category:
description = f"{category} — mirror of {domain}"
logger.info(f"Job {job_id}: packaging ZIM (welcome={welcome_page}, content_dir={content_dir})")
db.update_scrape_job(job_id, status='packaging')
# ── Phase C: zimwriterfs ───────────────────────────────────────
if stop_event.is_set() or _check_cancelled(db, job_id):
_handle_cancel(db, job_id, workspace, keep_workspace)
return
zim_name = _sanitize_filename(domain)
long_description = f"Offline mirror of {resolved_url} created by RECON web scraper"
zim_cmd = [
'zimwriterfs',
f'--welcome={welcome_page}',
f'--illustration=illustration.png',
f'--language={language}',
f'--title={title}',
f'--description={description[:80]}',
f'--longDescription={long_description[:4096]}',
f'--name={zim_name}',
f'--creator={domain}',
'--publisher=RECON',
content_dir,
zim_path,
]
zim_log = os.path.join(workspace, 'zimwriterfs.log')
try:
with open(zim_log, 'w') as log_fh:
proc = subprocess.Popen(
zim_cmd,
stdout=log_fh, stderr=subprocess.STDOUT,
)
db.update_scrape_job(job_id, subprocess_pid=proc.pid)
while proc.poll() is None:
if stop_event.is_set() or _check_cancelled(db, job_id):
_kill_process(proc)
_handle_cancel(db, job_id, workspace, keep_workspace)
return
try:
proc.wait(timeout=5)
except subprocess.TimeoutExpired:
pass
db.update_scrape_job(job_id, subprocess_pid=None)
if stop_event.is_set() or _check_cancelled(db, job_id):
_handle_cancel(db, job_id, workspace, keep_workspace)
return
if proc.returncode != 0:
output = ''
try:
with open(zim_log, 'r') as f:
f.seek(max(0, os.path.getsize(zim_log) - 500))
output = f.read()
except Exception:
pass
raise RuntimeError(f"zimwriterfs failed with code {proc.returncode}: {output[-500:]}")
except RuntimeError:
raise
except Exception as e:
db.update_scrape_job(job_id,
status='failed',
error_message=f"zimwriterfs error: {e}",
subprocess_pid=None,
completed_at=_now())
if not keep_workspace:
shutil.rmtree(workspace, ignore_errors=True)
return
if not os.path.isfile(zim_path):
db.update_scrape_job(job_id,
status='failed',
error_message='zimwriterfs produced no output file',
completed_at=_now())
return
logger.info(f"Job {job_id}: ZIM created at {zim_path}")
# ── Phase D: kiwix-manage + registration ───────────────────────
if stop_event.is_set() or _check_cancelled(db, job_id):
_handle_cancel(db, job_id, workspace, keep_workspace)
return
output_dir = config.get('scraper', {}).get('output_dir', '/mnt/kiwix')
zim_path = os.path.join(output_dir, zim_filename)
kiwix_manage = shutil.which('kiwix-manage') or '/opt/recon/bin/kiwix-manage' kiwix_manage = shutil.which('kiwix-manage') or '/opt/recon/bin/kiwix-manage'
library_xml = '/mnt/kiwix/library.xml' library_xml = '/mnt/kiwix/library.xml'
@ -670,26 +345,32 @@ def _process_job(job, config, stop_event):
except Exception as e: except Exception as e:
logger.warning(f"Job {job_id}: scan_zims failed: {e}") logger.warning(f"Job {job_id}: scan_zims failed: {e}")
try: # ── Phase 3: Complete ──────────────────────────────────────────
shutil.rmtree(workspace, ignore_errors=True)
except Exception:
pass
db.update_scrape_job(job_id, db.update_scrape_job(job_id,
status='complete', status='complete',
zim_filename=zim_filename, zim_filename=zim_filename,
zim_source_id=zim_source_id, zim_source_id=zim_source_id,
completed_at=_now()) completed_at=_now())
logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages, mode={crawl_mode})") logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages)")
def _handle_cancel(db, job_id, workspace, keep_workspace): def _handle_cancel(db, job_id):
"""Handle job cancellation: clean up and update status.""" """Handle job cancellation: clean up Docker container and update status."""
container_name = f'recon-scraper-{job_id}'
try:
subprocess.run(['docker', 'rm', '-f', container_name],
capture_output=True, timeout=10)
except Exception:
pass
# Clean up tmp dir if it exists
output_dir = '/mnt/kiwix'
tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}')
shutil.rmtree(tmp_dir, ignore_errors=True)
logger.info(f"Job {job_id}: cancelled") logger.info(f"Job {job_id}: cancelled")
db.update_scrape_job(job_id, db.update_scrape_job(job_id,
status='cancelled', status='cancelled',
subprocess_pid=None, subprocess_pid=None,
completed_at=_now()) completed_at=_now())
if not keep_workspace:
shutil.rmtree(workspace, ignore_errors=True)

View file

@ -12,7 +12,7 @@
jobs.forEach(function(j) { jobs.forEach(function(j) {
if (j.status === 'complete') complete++; if (j.status === 'complete') complete++;
else if (j.status === 'failed' || j.status === 'cancelled') failed++; else if (j.status === 'failed' || j.status === 'cancelled') failed++;
else if (j.status === 'running' || j.status === 'pending') active++; else if (j.status === 'scraping' || j.status === 'registering' || j.status === 'pending') active++;
}); });
RECON.set('sc-total', RECON.fmt(total)); RECON.set('sc-total', RECON.fmt(total));
RECON.set('sc-active', RECON.fmt(active)); RECON.set('sc-active', RECON.fmt(active));
@ -27,14 +27,12 @@
var html = ''; var html = '';
jobs.forEach(function(j) { jobs.forEach(function(j) {
var badge = statusBadge(j.status); var badge = statusBadge(j.status);
var mode = j.crawl_mode ?
'<span class="text-small">' + j.crawl_mode + '</span>' : '<span class="text-muted">\u2014</span>';
var pages = j.page_count ? RECON.fmt(j.page_count) : '\u2014'; var pages = j.page_count ? RECON.fmt(j.page_count) : '\u2014';
var zim = j.zim_filename ? var zim = j.zim_filename ?
'<span class="text-small">' + j.zim_filename + '</span>' : '\u2014'; '<span class="text-small">' + j.zim_filename + '</span>' : '\u2014';
var actions = ''; var actions = '';
if (j.status === 'running' || j.status === 'pending') { if (j.status === 'scraping' || j.status === 'registering' || j.status === 'pending') {
actions = '<button class="btn btn-danger" onclick="SCRAPER.cancel(' + j.id + ')">Cancel</button>'; actions = '<button class="btn btn-danger" onclick="SCRAPER.cancel(' + j.id + ')">Cancel</button>';
} else if (j.status === 'failed' || j.status === 'cancelled') { } else if (j.status === 'failed' || j.status === 'cancelled') {
actions = '<button class="btn" onclick="SCRAPER.retry(' + j.id + ')">Retry</button> ' + actions = '<button class="btn" onclick="SCRAPER.retry(' + j.id + ')">Retry</button> ' +
@ -50,14 +48,13 @@
'<td>' + j.id + '</td>' + '<td>' + j.id + '</td>' +
'<td><a href="' + escHtml(j.url) + '" target="_blank" title="' + escHtml(j.url) + '">' + escHtml(displayUrl) + '</a></td>' + '<td><a href="' + escHtml(j.url) + '" target="_blank" title="' + escHtml(j.url) + '">' + escHtml(displayUrl) + '</a></td>' +
'<td>' + escHtml(j.title || '\u2014') + '</td>' + '<td>' + escHtml(j.title || '\u2014') + '</td>' +
'<td>' + mode + '</td>' +
'<td>' + pages + '</td>' + '<td>' + pages + '</td>' +
'<td>' + badge + errorTooltip(j) + '</td>' + '<td>' + badge + errorTooltip(j) + '</td>' +
'<td>' + zim + '</td>' + '<td>' + zim + '</td>' +
'<td>' + actions + '</td>' + '<td>' + actions + '</td>' +
'</tr>'; '</tr>';
}); });
if (!html) html = '<tr><td colspan="8" class="text-muted">No scrape jobs</td></tr>'; if (!html) html = '<tr><td colspan="7" class="text-muted">No scrape jobs</td></tr>';
RECON.setHTML('sc-table-body', html); RECON.setHTML('sc-table-body', html);
}).catch(function(err) { }).catch(function(err) {
console.error('Scraper dashboard error:', err); console.error('Scraper dashboard error:', err);
@ -67,7 +64,8 @@
function statusBadge(status) { function statusBadge(status) {
var map = { var map = {
'pending': '<span class="badge-detected">PENDING</span>', 'pending': '<span class="badge-detected">PENDING</span>',
'running': '<span class="badge-processing">RUNNING</span>', 'scraping': '<span class="badge-processing">SCRAPING</span>',
'registering': '<span class="badge-processing">REGISTERING</span>',
'complete': '<span class="badge-complete">COMPLETE</span>', 'complete': '<span class="badge-complete">COMPLETE</span>',
'failed': '<span class="badge-failed">FAILED</span>', 'failed': '<span class="badge-failed">FAILED</span>',
'cancelled': '<span class="badge-detected">CANCELLED</span>' 'cancelled': '<span class="badge-detected">CANCELLED</span>'
@ -98,12 +96,9 @@
var title = document.getElementById('sf-title').value.trim(); var title = document.getElementById('sf-title').value.trim();
var lang = document.getElementById('sf-lang').value; var lang = document.getElementById('sf-lang').value;
var category = document.getElementById('sf-category').value.trim(); var category = document.getElementById('sf-category').value.trim();
var mode = document.getElementById('sf-mode').value;
if (title) body.title = title; if (title) body.title = title;
if (lang) body.language = lang; if (lang) body.language = lang;
if (category) body.category = category; if (category) body.category = category;
if (mode) body.crawl_mode = mode;
var btn = document.getElementById('sf-submit-btn'); var btn = document.getElementById('sf-submit-btn');
var feedback = document.getElementById('sf-feedback'); var feedback = document.getElementById('sf-feedback');

View file

@ -17,7 +17,7 @@
style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;"> style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
</div> </div>
</div> </div>
<div style="display:grid;grid-template-columns:1fr 1fr 1fr auto;gap:12px;align-items:end;"> <div style="display:grid;grid-template-columns:1fr 1fr auto;gap:12px;align-items:end;">
<div> <div>
<label class="text-small text-muted" style="display:block;margin-bottom:4px;">Language</label> <label class="text-small text-muted" style="display:block;margin-bottom:4px;">Language</label>
<select id="sf-lang" <select id="sf-lang"
@ -38,15 +38,6 @@
<input type="text" id="sf-category" placeholder="Optional" <input type="text" id="sf-category" placeholder="Optional"
style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;"> style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
</div> </div>
<div>
<label class="text-small text-muted" style="display:block;margin-bottom:4px;">Crawl Mode</label>
<select id="sf-mode"
style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
<option value="" selected>Auto-detect</option>
<option value="static">Static (wget)</option>
<option value="browser">Browser (SingleFile)</option>
</select>
</div>
<div> <div>
<button type="submit" class="btn" id="sf-submit-btn">Submit</button> <button type="submit" class="btn" id="sf-submit-btn">Submit</button>
</div> </div>
@ -75,7 +66,6 @@
<th>ID</th> <th>ID</th>
<th>URL</th> <th>URL</th>
<th>Title</th> <th>Title</th>
<th>Mode</th>
<th>Pages</th> <th>Pages</th>
<th>Status</th> <th>Status</th>
<th>ZIM</th> <th>ZIM</th>
@ -83,7 +73,7 @@
</tr> </tr>
</thead> </thead>
<tbody id="sc-table-body"> <tbody id="sc-table-body">
<tr><td colspan="8" class="text-muted">Loading...</td></tr> <tr><td colspan="7" class="text-muted">Loading...</td></tr>
</tbody> </tbody>
</table> </table>
</div> </div>