mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Replace wget/SingleFile/Playwright backends with Zimit
- Zimit Docker container handles all site types (static, SPA, JS redirects)
- Removed: _detect_crawl_mode, _crawl_wget, _crawl_singlefile, preflight logic
- Added: _crawl_zimit() with Docker lifecycle management
- Simplified pipeline: submit → Zimit crawl → kiwix-manage register → done
- No more zimwriterfs step — Zimit produces ZIM directly
- Dashboard UI simplified: removed crawl mode dropdown
- Config simplified: removed reject patterns, preflight, singlefile sections

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
f0b160ef7c
commit
8945c82e3f
5 changed files with 212 additions and 606 deletions
75
config.yaml
75
config.yaml
|
|
@ -414,81 +414,12 @@ peertube:
|
||||||
poll_interval: 1800 # Seconds between PeerTube acquisition polls (30 min)
|
poll_interval: 1800 # Seconds between PeerTube acquisition polls (30 min)
|
||||||
|
|
||||||
scraper:
|
scraper:
|
||||||
workspace: /opt/recon/data/scraper # Working directory for wget mirrors + ZIM builds
|
workspace: /opt/recon/data/scraper # Working directory (tmp dirs for Zimit output)
|
||||||
output_dir: /mnt/kiwix # Finished .zim files land here (kiwix-serve library)
|
output_dir: /mnt/kiwix # Finished .zim files land here (kiwix-serve library)
|
||||||
rate_limit_delay: 0.5 # Seconds between wget requests (--wait)
|
|
||||||
wait_random: 1.0 # Random jitter added to wait (--random-wait range)
|
|
||||||
default_language: eng # ISO 639-3 language code for ZIM metadata
|
default_language: eng # ISO 639-3 language code for ZIM metadata
|
||||||
user_agent: "Mozilla/5.0 (compatible; RECON/1.0; +https://echo6.co)"
|
|
||||||
poll_interval: 300 # Seconds between checking for pending scrape jobs
|
poll_interval: 300 # Seconds between checking for pending scrape jobs
|
||||||
keep_workspace_on_failure: true # Retain workspace for debugging when a job fails
|
docker_image: ghcr.io/openzim/zimit # Zimit Docker image for web crawling
|
||||||
|
docker_workers: 2 # Concurrent crawl workers inside Zimit container
|
||||||
# Default URL patterns rejected by wget --reject-regex.
|
|
||||||
# Covers common CMS junk across WordPress, Squarespace, Wix, Ghost, Drupal, etc.
|
|
||||||
# Per-job overrides: additional_reject_patterns (appended) or skip_default_patterns (bypass).
|
|
||||||
default_reject_patterns:
|
|
||||||
# WordPress
|
|
||||||
- '\?share='
|
|
||||||
- '\?replytocom='
|
|
||||||
- '\?like_comment='
|
|
||||||
- '/feed/'
|
|
||||||
- '/wp-json/'
|
|
||||||
- '/wp-login'
|
|
||||||
- '/wp-admin'
|
|
||||||
- '/wp-cron'
|
|
||||||
- '\?attachment_id='
|
|
||||||
- '/xmlrpc'
|
|
||||||
- '/trackback'
|
|
||||||
- '/comment-page-'
|
|
||||||
- '\?doing_wp_cron'
|
|
||||||
# Squarespace
|
|
||||||
- '\?format=json'
|
|
||||||
- '\?format=rss'
|
|
||||||
- '/api/'
|
|
||||||
# Wix
|
|
||||||
- '/_api/'
|
|
||||||
- '/_partials/'
|
|
||||||
# Ghost
|
|
||||||
- '/ghost/'
|
|
||||||
- '/p/'
|
|
||||||
# Drupal
|
|
||||||
- '\?q=comment'
|
|
||||||
- '\?q=node'
|
|
||||||
- '/user/login'
|
|
||||||
- '/user/register'
|
|
||||||
# General CMS / site chrome
|
|
||||||
- '/login'
|
|
||||||
- '/signup'
|
|
||||||
- '/register'
|
|
||||||
- '/cart'
|
|
||||||
- '/checkout'
|
|
||||||
- '/search\?'
|
|
||||||
- '/tag/'
|
|
||||||
- '/author/'
|
|
||||||
- '\?print='
|
|
||||||
- '\?pdf='
|
|
||||||
- '\?format=amp'
|
|
||||||
- '\?preview='
|
|
||||||
- '/rss'
|
|
||||||
- '/atom'
|
|
||||||
- '/cdn-cgi/'
|
|
||||||
|
|
||||||
# Pre-flight mode detection
|
|
||||||
preflight:
|
|
||||||
enabled: true
|
|
||||||
timeout: 30 # Seconds for single-page Playwright fetch
|
|
||||||
min_static_size: 5120 # 5KB - wget HTML below this = suspect JS site
|
|
||||||
min_browser_size: 20480 # 20KB - browser HTML above this confirms JS
|
|
||||||
spa_markers:
|
|
||||||
- 'div#root'
|
|
||||||
- 'div#app'
|
|
||||||
- 'div#__next'
|
|
||||||
|
|
||||||
# SingleFile CLI settings (browser crawl mode)
|
|
||||||
singlefile:
|
|
||||||
executable: single-file
|
|
||||||
chromium_path: "/usr/bin/chromium-browser"
|
|
||||||
crawl_max_depth: 10
|
|
||||||
|
|
||||||
# Stream B: New Library Pipeline
|
# Stream B: New Library Pipeline
|
||||||
new_pipeline:
|
new_pipeline:
|
||||||
|
|
|
||||||
61
lib/api.py
61
lib/api.py
|
|
@ -44,6 +44,20 @@ app = Flask(__name__,
|
||||||
|
|
||||||
app.config['MAX_CONTENT_LENGTH'] = None # ZIM files can be multi-GB
|
app.config['MAX_CONTENT_LENGTH'] = None # ZIM files can be multi-GB
|
||||||
|
|
||||||
|
|
||||||
|
# ── Large ZIM upload support ──
|
||||||
|
# Override stream factory so ZIM uploads write directly to /mnt/kiwix/
|
||||||
|
# instead of /tmp (which is on the 96GB root disk and can't hold 100GB+ ZIMs).
|
||||||
|
from flask import Request as _FlaskRequest
|
||||||
|
|
||||||
|
class _LargeZimRequest(_FlaskRequest):
|
||||||
|
def _get_file_stream(self, total_content_length, content_type, filename=None, content_length=None):
|
||||||
|
if filename and filename.lower().endswith('.zim'):
|
||||||
|
return tempfile.NamedTemporaryFile('wb+', dir='/mnt/kiwix', prefix='.upload_', suffix='.tmp', delete=False)
|
||||||
|
return super()._get_file_stream(total_content_length, content_type, filename, content_length)
|
||||||
|
|
||||||
|
app.request_class = _LargeZimRequest
|
||||||
|
|
||||||
# ── Navigation Constants ──
|
# ── Navigation Constants ──
|
||||||
|
|
||||||
KNOWLEDGE_SUBNAV = [
|
KNOWLEDGE_SUBNAV = [
|
||||||
|
|
@ -2020,14 +2034,23 @@ def api_kiwix_upload():
|
||||||
|
|
||||||
filename = secure_filename(f.filename)
|
filename = secure_filename(f.filename)
|
||||||
dest = os.path.join('/mnt/kiwix', filename)
|
dest = os.path.join('/mnt/kiwix', filename)
|
||||||
tmp_dest = dest + '.tmp'
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Stream was written directly to /mnt/kiwix/ by _LargeZimRequest —
|
||||||
|
# rename in-place instead of copying 100GB+ through f.save()
|
||||||
|
if hasattr(f.stream, 'name') and f.stream.name:
|
||||||
|
tmp_path = f.stream.name
|
||||||
|
f.stream.close()
|
||||||
|
os.rename(tmp_path, dest)
|
||||||
|
else:
|
||||||
|
tmp_dest = dest + '.tmp'
|
||||||
f.save(tmp_dest)
|
f.save(tmp_dest)
|
||||||
os.rename(tmp_dest, dest)
|
os.rename(tmp_dest, dest)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if os.path.exists(tmp_dest):
|
# Clean up any temp files on failure
|
||||||
os.remove(tmp_dest)
|
for p in [locals().get('tmp_path', ''), locals().get('tmp_dest', '')]:
|
||||||
|
if p and os.path.exists(p):
|
||||||
|
os.remove(p)
|
||||||
return jsonify({'error': f'Save failed: {e}'}), 500
|
return jsonify({'error': f'Save failed: {e}'}), 500
|
||||||
|
|
||||||
# Register with kiwix-serve library
|
# Register with kiwix-serve library
|
||||||
|
|
@ -2320,24 +2343,11 @@ def api_scraper_submit():
|
||||||
title = data.get('title', '').strip() or None
|
title = data.get('title', '').strip() or None
|
||||||
category = data.get('category', '').strip() or None
|
category = data.get('category', '').strip() or None
|
||||||
|
|
||||||
# Optional per-job reject pattern overrides
|
|
||||||
additional_reject_patterns = data.get('additional_reject_patterns')
|
|
||||||
skip_default_patterns = bool(data.get('skip_default_patterns', False))
|
|
||||||
|
|
||||||
# Optional crawl mode override (static, browser, redirect, or null for auto-detect)
|
|
||||||
crawl_mode = data.get('crawl_mode')
|
|
||||||
if crawl_mode and crawl_mode not in ('static', 'browser', 'redirect'):
|
|
||||||
return jsonify({'error': "crawl_mode must be 'static', 'browser', 'redirect', or null"}), 400
|
|
||||||
|
|
||||||
# Serialize additional patterns as JSON if provided
|
|
||||||
import json as _json
|
|
||||||
additional_json = _json.dumps(additional_reject_patterns) if additional_reject_patterns else None
|
|
||||||
|
|
||||||
db = StatusDB()
|
db = StatusDB()
|
||||||
conn = db._get_conn()
|
conn = db._get_conn()
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"INSERT INTO scrape_jobs (url, title, language, category, additional_reject_patterns, skip_default_patterns, crawl_mode) VALUES (?, ?, ?, ?, ?, ?, ?)",
|
"INSERT INTO scrape_jobs (url, title, language, category, crawl_mode) VALUES (?, ?, ?, ?, ?)",
|
||||||
(url, title, language, category, additional_json, int(skip_default_patterns), crawl_mode)
|
(url, title, language, category, 'zimit')
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
job_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
|
job_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
|
||||||
|
|
@ -2358,8 +2368,6 @@ def api_scraper_jobs():
|
||||||
@app.route('/api/scraper/cancel/<int:job_id>', methods=['POST'])
|
@app.route('/api/scraper/cancel/<int:job_id>', methods=['POST'])
|
||||||
def api_scraper_cancel(job_id):
|
def api_scraper_cancel(job_id):
|
||||||
"""Cancel a scrape job."""
|
"""Cancel a scrape job."""
|
||||||
import os as _os
|
|
||||||
import signal as _signal
|
|
||||||
|
|
||||||
db = StatusDB()
|
db = StatusDB()
|
||||||
job = db.get_scrape_job(job_id)
|
job = db.get_scrape_job(job_id)
|
||||||
|
|
@ -2372,13 +2380,14 @@ def api_scraper_cancel(job_id):
|
||||||
# Set cancelled in DB — the runner loop checks this between phases
|
# Set cancelled in DB — the runner loop checks this between phases
|
||||||
db.update_scrape_job(job_id, status='cancelled')
|
db.update_scrape_job(job_id, status='cancelled')
|
||||||
|
|
||||||
# If there's an active subprocess, send SIGTERM
|
# Stop the Docker container if running
|
||||||
pid = job.get('subprocess_pid')
|
container_name = f'recon-scraper-{job_id}'
|
||||||
if pid:
|
|
||||||
try:
|
try:
|
||||||
_os.kill(pid, _signal.SIGTERM)
|
import subprocess as _subprocess
|
||||||
except (ProcessLookupError, PermissionError):
|
_subprocess.run(['docker', 'rm', '-f', container_name],
|
||||||
pass # Process already gone
|
capture_output=True, timeout=10)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
logger.info(f"Scraper job {job_id} cancelled")
|
logger.info(f"Scraper job {job_id} cancelled")
|
||||||
return jsonify({'ok': True})
|
return jsonify({'ok': True})
|
||||||
|
|
|
||||||
|
|
@ -1,27 +1,21 @@
|
||||||
"""
|
"""
|
||||||
RECON Scraper Runner
|
RECON Scraper Runner
|
||||||
|
|
||||||
Daemon loop that processes scrape jobs: crawl → zimwriterfs → kiwix-manage.
|
Daemon loop that processes scrape jobs: crawl via Zimit → kiwix-manage.
|
||||||
Supports two crawl backends:
|
Zimit (openZIM Docker crawler) handles all site types and produces ZIM
|
||||||
- wget (static sites) — default
|
files directly — no separate zimwriterfs step needed.
|
||||||
- SingleFile CLI (JS-rendered sites) — browser mode
|
|
||||||
|
|
||||||
Pre-flight detection automatically chooses the right backend unless
|
|
||||||
crawl_mode is pre-set on the job.
|
|
||||||
|
|
||||||
Public entry point: scraper_loop(stop_event, config).
|
Public entry point: scraper_loop(stop_event, config).
|
||||||
|
|
||||||
Config section: scraper (workspace, output_dir, rate_limit_delay, preflight, singlefile)
|
Config section: scraper (output_dir, docker_image, docker_workers, poll_interval)
|
||||||
DB table: scrape_jobs (status flow: pending → scraping → packaging → complete)
|
DB table: scrape_jobs (status flow: pending → scraping → registering → complete)
|
||||||
"""
|
"""
|
||||||
import glob as _glob
|
import glob as _glob
|
||||||
import json as _json
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import signal
|
import signal
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
|
||||||
import time
|
import time
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
@ -39,6 +33,9 @@ def scraper_loop(stop_event, config):
|
||||||
|
|
||||||
logger.info("Scraper runner started")
|
logger.info("Scraper runner started")
|
||||||
|
|
||||||
|
# Clean up any orphan Zimit containers from a previous crash
|
||||||
|
_cleanup_orphan_containers()
|
||||||
|
|
||||||
while not stop_event.is_set():
|
while not stop_event.is_set():
|
||||||
db = StatusDB()
|
db = StatusDB()
|
||||||
job = db.get_pending_scrape_job()
|
job = db.get_pending_scrape_job()
|
||||||
|
|
@ -97,314 +94,115 @@ def _kill_process(proc, timeout=5):
|
||||||
proc.wait(timeout=2)
|
proc.wait(timeout=2)
|
||||||
|
|
||||||
|
|
||||||
def _count_html_files(directory):
|
def _cleanup_orphan_containers():
|
||||||
"""Count HTML files in a directory tree."""
|
"""Remove any leftover recon-scraper-* Docker containers from a previous crash."""
|
||||||
count = 0
|
|
||||||
for root, dirs, files in os.walk(directory):
|
|
||||||
for f in files:
|
|
||||||
if f.lower().endswith(('.html', '.htm')):
|
|
||||||
count += 1
|
|
||||||
return count
|
|
||||||
|
|
||||||
|
|
||||||
def _find_welcome_page(content_dir, domain):
|
|
||||||
"""Find the welcome page (index.html) in the wget mirror."""
|
|
||||||
domain_dir = None
|
|
||||||
for entry in os.listdir(content_dir):
|
|
||||||
entry_path = os.path.join(content_dir, entry)
|
|
||||||
if os.path.isdir(entry_path):
|
|
||||||
domain_dir = entry_path
|
|
||||||
break
|
|
||||||
|
|
||||||
if not domain_dir:
|
|
||||||
return None, content_dir
|
|
||||||
|
|
||||||
for candidate in ['index.html', 'index.htm']:
|
|
||||||
path = os.path.join(domain_dir, candidate)
|
|
||||||
if os.path.isfile(path):
|
|
||||||
return candidate, domain_dir
|
|
||||||
|
|
||||||
for root, dirs, files in os.walk(domain_dir):
|
|
||||||
for f in sorted(files):
|
|
||||||
if f.lower().endswith(('.html', '.htm')):
|
|
||||||
rel = os.path.relpath(os.path.join(root, f), domain_dir)
|
|
||||||
return rel, domain_dir
|
|
||||||
|
|
||||||
return 'index.html', domain_dir
|
|
||||||
|
|
||||||
|
|
||||||
def _create_placeholder_illustration(path):
|
|
||||||
"""Create a 48x48 placeholder PNG for zimwriterfs --illustration."""
|
|
||||||
from PIL import Image
|
|
||||||
img = Image.new('RGB', (48, 48), color=(40, 192, 232))
|
|
||||||
img.save(path, 'PNG')
|
|
||||||
|
|
||||||
|
|
||||||
# ── Crawl mode detection ──────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def _get_chromium_path(config):
|
|
||||||
"""Auto-detect Chromium from Playwright's cache, or use config override."""
|
|
||||||
configured = config.get('scraper', {}).get('singlefile', {}).get('chromium_path', '')
|
|
||||||
if configured and os.path.isfile(configured):
|
|
||||||
return configured
|
|
||||||
# Playwright stores Chromium — check both root and user caches
|
|
||||||
search_paths = [
|
|
||||||
os.path.expanduser('~/.cache/ms-playwright/chromium-*/chrome-linux*/chrome'),
|
|
||||||
'/root/.cache/ms-playwright/chromium-*/chrome-linux*/chrome',
|
|
||||||
]
|
|
||||||
for pattern in search_paths:
|
|
||||||
matches = sorted(_glob.glob(pattern))
|
|
||||||
if matches:
|
|
||||||
return matches[-1]
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _detect_crawl_mode(url, config):
|
|
||||||
"""
|
|
||||||
Pre-flight detection: determine whether a URL needs a browser to crawl.
|
|
||||||
|
|
||||||
Returns (mode, resolved_url) where mode is 'static', 'browser', or 'redirect'.
|
|
||||||
'redirect' means the URL redirected to a different domain (parking page etc.);
|
|
||||||
resolved_url will be the final browser URL in that case.
|
|
||||||
"""
|
|
||||||
preflight_cfg = config.get('scraper', {}).get('preflight', {})
|
|
||||||
if not preflight_cfg.get('enabled', True):
|
|
||||||
return 'static', url
|
|
||||||
|
|
||||||
timeout = preflight_cfg.get('timeout', 30)
|
|
||||||
min_static = preflight_cfg.get('min_static_size', 5120)
|
|
||||||
min_browser = preflight_cfg.get('min_browser_size', 20480)
|
|
||||||
spa_markers = preflight_cfg.get('spa_markers', ['div#root', 'div#app', 'div#__next'])
|
|
||||||
|
|
||||||
input_domain = urlparse(url).hostname or ''
|
|
||||||
if input_domain.startswith('www.'):
|
|
||||||
input_domain = input_domain[4:]
|
|
||||||
|
|
||||||
# Step 1: wget single-page fetch
|
|
||||||
wget_html = ''
|
|
||||||
wget_size = 0
|
|
||||||
try:
|
try:
|
||||||
with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as tmp:
|
|
||||||
tmp_path = tmp.name
|
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['wget', '-q', '-O', tmp_path, '--timeout=30', '--tries=1', url],
|
['docker', 'ps', '-a', '--filter', 'name=recon-scraper-', '--format', '{{.Names}}'],
|
||||||
capture_output=True, text=True, timeout=timeout + 5
|
capture_output=True, text=True, timeout=10
|
||||||
)
|
)
|
||||||
if os.path.isfile(tmp_path):
|
if result.returncode == 0 and result.stdout.strip():
|
||||||
wget_size = os.path.getsize(tmp_path)
|
for name in result.stdout.strip().split('\n'):
|
||||||
with open(tmp_path, 'r', errors='replace') as f:
|
name = name.strip()
|
||||||
wget_html = f.read()
|
if name:
|
||||||
os.unlink(tmp_path)
|
subprocess.run(['docker', 'rm', '-f', name], capture_output=True, timeout=10)
|
||||||
|
logger.info(f"Cleaned up orphan container: {name}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Preflight wget failed for {url}: {e}")
|
logger.warning(f"Orphan container cleanup failed: {e}")
|
||||||
try:
|
|
||||||
os.unlink(tmp_path)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Step 2: Playwright headless fetch
|
|
||||||
browser_html = ''
|
|
||||||
browser_size = 0
|
|
||||||
browser_url = url
|
|
||||||
try:
|
|
||||||
from playwright.sync_api import sync_playwright
|
|
||||||
with sync_playwright() as p:
|
|
||||||
browser = p.chromium.launch(
|
|
||||||
headless=True,
|
|
||||||
args=['--no-sandbox', '--disable-dev-shm-usage']
|
|
||||||
)
|
|
||||||
page = browser.new_page()
|
|
||||||
page.goto(url, wait_until='networkidle', timeout=timeout * 1000)
|
|
||||||
browser_url = page.url
|
|
||||||
browser_html = page.content()
|
|
||||||
browser_size = len(browser_html.encode('utf-8'))
|
|
||||||
browser.close()
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug(f"Preflight Playwright failed for {url}: {e}")
|
|
||||||
# If Playwright fails entirely, fall back to static
|
|
||||||
return 'static', url
|
|
||||||
|
|
||||||
# Step 3: Decision logic
|
|
||||||
browser_domain = urlparse(browser_url).hostname or ''
|
|
||||||
if browser_domain.startswith('www.'):
|
|
||||||
browser_domain = browser_domain[4:]
|
|
||||||
|
|
||||||
# Check for cross-domain redirect (parking page detection)
|
|
||||||
if browser_domain and input_domain and browser_domain != input_domain:
|
|
||||||
logger.info(f"Preflight: {url} redirected to different domain {browser_domain}, mode=redirect")
|
|
||||||
return 'redirect', browser_url
|
|
||||||
|
|
||||||
# Check size disparity: small wget + large browser = JS-rendered
|
|
||||||
if wget_size < min_static and browser_size > min_browser:
|
|
||||||
logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=browser")
|
|
||||||
return 'browser', url
|
|
||||||
|
|
||||||
# Check for SPA shell markers in wget HTML
|
|
||||||
if wget_html:
|
|
||||||
try:
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
soup = BeautifulSoup(wget_html, 'html.parser')
|
|
||||||
for marker in spa_markers:
|
|
||||||
# marker is like 'div#root' — split tag and id
|
|
||||||
parts = marker.split('#', 1)
|
|
||||||
tag = parts[0] if parts[0] else 'div'
|
|
||||||
elem_id = parts[1] if len(parts) > 1 else None
|
|
||||||
elem = soup.find(tag, id=elem_id) if elem_id else soup.find(tag)
|
|
||||||
if elem:
|
|
||||||
text_content = elem.get_text(strip=True)
|
|
||||||
if len(text_content) < 100:
|
|
||||||
logger.info(f"Preflight: {url} has SPA marker {marker} with {len(text_content)} chars text, mode=browser")
|
|
||||||
return 'browser', url
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug(f"Preflight SPA marker check failed: {e}")
|
|
||||||
|
|
||||||
logger.info(f"Preflight: {url} wget={wget_size}B browser={browser_size}B, mode=static")
|
|
||||||
return 'static', url
|
|
||||||
|
|
||||||
|
|
||||||
# ── Crawl backends ────────────────────────────────────────────────
|
# ── Zimit crawl backend ──────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
def _crawl_wget(job, url, site_dir, config, stop_event, db):
|
def _crawl_zimit(job, config, stop_event, db):
|
||||||
"""
|
"""
|
||||||
wget mirror crawl backend.
|
Crawl a URL using Zimit (openZIM Docker crawler).
|
||||||
Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel.
|
|
||||||
|
Returns (page_count, zim_filename, error_msg).
|
||||||
|
On success: (count, filename, None)
|
||||||
|
On failure: (0, None, error_string)
|
||||||
"""
|
"""
|
||||||
job_id = job['id']
|
job_id = job['id']
|
||||||
|
url = job['url']
|
||||||
|
title = job.get('title') or _sanitize_domain(url)
|
||||||
|
language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng')
|
||||||
|
category = job.get('category') or ''
|
||||||
|
|
||||||
scraper_cfg = config.get('scraper', {})
|
scraper_cfg = config.get('scraper', {})
|
||||||
rate_limit_delay = scraper_cfg.get('rate_limit_delay', 0.5)
|
output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix')
|
||||||
user_agent = scraper_cfg.get('user_agent', 'Mozilla/5.0 (compatible; RECON/1.0)')
|
docker_image = scraper_cfg.get('docker_image', 'ghcr.io/openzim/zimit')
|
||||||
keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True)
|
docker_workers = scraper_cfg.get('docker_workers', 2)
|
||||||
workspace = os.path.dirname(site_dir)
|
|
||||||
|
|
||||||
# Build reject-regex from config defaults + per-job overrides
|
|
||||||
reject_patterns = []
|
|
||||||
skip_defaults = bool(job.get('skip_default_patterns'))
|
|
||||||
if not skip_defaults:
|
|
||||||
reject_patterns.extend(scraper_cfg.get('default_reject_patterns', []))
|
|
||||||
additional_raw = job.get('additional_reject_patterns')
|
|
||||||
if additional_raw:
|
|
||||||
try:
|
|
||||||
additional = _json.loads(additional_raw) if isinstance(additional_raw, str) else additional_raw
|
|
||||||
if isinstance(additional, list):
|
|
||||||
reject_patterns.extend(additional)
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
wget_cmd = [
|
|
||||||
'wget', '--mirror', '--convert-links', '--adjust-extension',
|
|
||||||
'--page-requisites', '--no-parent',
|
|
||||||
'--restrict-file-names=windows',
|
|
||||||
f'--wait={rate_limit_delay}', '--random-wait',
|
|
||||||
f'--user-agent={user_agent}',
|
|
||||||
f'--directory-prefix={site_dir}',
|
|
||||||
'--timeout=30', '--tries=3',
|
|
||||||
]
|
|
||||||
if reject_patterns:
|
|
||||||
combined_regex = '|'.join(f'({p})' for p in reject_patterns)
|
|
||||||
wget_cmd.extend([f'--reject-regex={combined_regex}'])
|
|
||||||
logger.info(f"Job {job_id}: reject-regex has {len(reject_patterns)} patterns")
|
|
||||||
wget_cmd.append(url)
|
|
||||||
|
|
||||||
logger.info(f"Job {job_id}: wget mirror starting")
|
|
||||||
wget_log = os.path.join(workspace, 'wget.log')
|
|
||||||
try:
|
|
||||||
with open(wget_log, 'w') as log_fh:
|
|
||||||
proc = subprocess.Popen(
|
|
||||||
wget_cmd,
|
|
||||||
stdout=log_fh, stderr=subprocess.STDOUT,
|
|
||||||
)
|
|
||||||
db.update_scrape_job(job_id, subprocess_pid=proc.pid)
|
|
||||||
|
|
||||||
while proc.poll() is None:
|
|
||||||
if stop_event.is_set() or _check_cancelled(db, job_id):
|
|
||||||
_kill_process(proc)
|
|
||||||
return 0, 'cancelled'
|
|
||||||
try:
|
|
||||||
proc.wait(timeout=5)
|
|
||||||
except subprocess.TimeoutExpired:
|
|
||||||
pass
|
|
||||||
|
|
||||||
db.update_scrape_job(job_id, subprocess_pid=None)
|
|
||||||
|
|
||||||
if stop_event.is_set() or _check_cancelled(db, job_id):
|
|
||||||
return 0, 'cancelled'
|
|
||||||
|
|
||||||
# wget returns 8 for some server errors but may still have useful content
|
|
||||||
if proc.returncode not in (0, 4, 6, 8):
|
|
||||||
output = ''
|
|
||||||
try:
|
|
||||||
with open(wget_log, 'r') as f:
|
|
||||||
f.seek(max(0, os.path.getsize(wget_log) - 500))
|
|
||||||
output = f.read()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return 0, f"wget failed with code {proc.returncode}: {output[-500:]}"
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
return 0, f"wget error: {e}"
|
|
||||||
|
|
||||||
page_count = _count_html_files(site_dir)
|
|
||||||
logger.info(f"Job {job_id}: wget complete, {page_count} HTML pages found")
|
|
||||||
|
|
||||||
if page_count == 0:
|
|
||||||
return 0, 'wget produced no HTML files'
|
|
||||||
|
|
||||||
return page_count, None
|
|
||||||
|
|
||||||
|
|
||||||
def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
|
|
||||||
"""
|
|
||||||
SingleFile CLI crawl backend for JS-rendered sites.
|
|
||||||
Returns (page_count, error_msg) — error_msg is None on success, 'cancelled' on cancel.
|
|
||||||
"""
|
|
||||||
job_id = job['id']
|
|
||||||
scraper_cfg = config.get('scraper', {})
|
|
||||||
sf_cfg = scraper_cfg.get('singlefile', {})
|
|
||||||
keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True)
|
|
||||||
workspace = os.path.dirname(site_dir)
|
|
||||||
|
|
||||||
executable = sf_cfg.get('executable', 'single-file')
|
|
||||||
chromium_path = _get_chromium_path(config)
|
|
||||||
crawl_max_depth = sf_cfg.get('crawl_max_depth', 10)
|
|
||||||
|
|
||||||
if not chromium_path:
|
|
||||||
return 0, 'Chromium not found — cannot use browser crawl mode'
|
|
||||||
|
|
||||||
# SingleFile outputs into site_dir/<domain>/ to match wget's structure
|
|
||||||
domain = _sanitize_domain(url)
|
domain = _sanitize_domain(url)
|
||||||
output_dir = os.path.join(site_dir, domain)
|
date_tag = datetime.now().strftime('%Y-%m')
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
container_name = f'recon-scraper-{job_id}'
|
||||||
|
tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}')
|
||||||
|
|
||||||
sf_cmd = [
|
# Clean up any pre-existing container with same name (retry scenario)
|
||||||
executable,
|
subprocess.run(['docker', 'rm', '-f', container_name], capture_output=True, timeout=10)
|
||||||
'--crawl-links=true',
|
|
||||||
'--crawl-inner-links-only=true',
|
os.makedirs(tmp_dir, exist_ok=True)
|
||||||
'--crawl-no-parent=true',
|
|
||||||
'--crawl-replace-URLs=true',
|
description = f"Mirror of {domain}"
|
||||||
f'--crawl-max-depth={crawl_max_depth}',
|
if category:
|
||||||
f'--browser-executable-path={chromium_path}',
|
description = f"{category} — mirror of {domain}"
|
||||||
'--browser-headless=true',
|
|
||||||
'--browser-args=["--no-sandbox","--disable-dev-shm-usage"]',
|
docker_cmd = [
|
||||||
f'--output-directory={output_dir}',
|
'docker', 'run', '--rm',
|
||||||
url,
|
'--name', container_name,
|
||||||
|
'-v', f'{tmp_dir}:/output',
|
||||||
|
docker_image,
|
||||||
|
'--url', url,
|
||||||
|
'--name', _sanitize_filename(domain),
|
||||||
|
'--lang', language,
|
||||||
|
'--title', title,
|
||||||
|
'--description', description[:80],
|
||||||
|
'--output', '/output',
|
||||||
|
'--workers', str(docker_workers),
|
||||||
]
|
]
|
||||||
|
|
||||||
logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth})")
|
logger.info(f"Job {job_id}: Zimit crawl starting — {url}")
|
||||||
sf_log = os.path.join(workspace, 'singlefile.log')
|
|
||||||
try:
|
try:
|
||||||
with open(sf_log, 'w') as log_fh:
|
|
||||||
proc = subprocess.Popen(
|
proc = subprocess.Popen(
|
||||||
sf_cmd,
|
docker_cmd,
|
||||||
stdout=log_fh, stderr=subprocess.STDOUT,
|
stdout=subprocess.DEVNULL,
|
||||||
|
stderr=subprocess.DEVNULL,
|
||||||
)
|
)
|
||||||
db.update_scrape_job(job_id, subprocess_pid=proc.pid)
|
db.update_scrape_job(job_id, subprocess_pid=proc.pid)
|
||||||
|
|
||||||
|
last_progress_check = 0
|
||||||
while proc.poll() is None:
|
while proc.poll() is None:
|
||||||
if stop_event.is_set() or _check_cancelled(db, job_id):
|
if stop_event.is_set() or _check_cancelled(db, job_id):
|
||||||
|
# Stop the Docker container
|
||||||
|
subprocess.run(['docker', 'rm', '-f', container_name],
|
||||||
|
capture_output=True, timeout=10)
|
||||||
_kill_process(proc)
|
_kill_process(proc)
|
||||||
return 0, 'cancelled'
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
return 0, None, 'cancelled'
|
||||||
|
|
||||||
|
# Check progress every 30s via docker logs
|
||||||
|
now = time.time()
|
||||||
|
if now - last_progress_check >= 30:
|
||||||
|
last_progress_check = now
|
||||||
|
try:
|
||||||
|
log_result = subprocess.run(
|
||||||
|
['docker', 'logs', '--tail', '20', container_name],
|
||||||
|
capture_output=True, text=True, timeout=10
|
||||||
|
)
|
||||||
|
if log_result.returncode == 0 and log_result.stderr:
|
||||||
|
# Zimit/Browsertrix logs page counts — look for numbers
|
||||||
|
lines = log_result.stderr.strip().split('\n')
|
||||||
|
for line in reversed(lines):
|
||||||
|
# Look for patterns like "X pages" or page count indicators
|
||||||
|
match = re.search(r'(\d+)\s+page', line, re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
count = int(match.group(1))
|
||||||
|
if count > 0:
|
||||||
|
db.update_scrape_job(job_id, page_count=count)
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
proc.wait(timeout=5)
|
proc.wait(timeout=5)
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
|
|
@ -413,42 +211,59 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
|
||||||
db.update_scrape_job(job_id, subprocess_pid=None)
|
db.update_scrape_job(job_id, subprocess_pid=None)
|
||||||
|
|
||||||
if stop_event.is_set() or _check_cancelled(db, job_id):
|
if stop_event.is_set() or _check_cancelled(db, job_id):
|
||||||
return 0, 'cancelled'
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
return 0, None, 'cancelled'
|
||||||
|
|
||||||
if proc.returncode != 0:
|
if proc.returncode != 0:
|
||||||
output = ''
|
# Capture last 50 lines of docker logs for error context
|
||||||
|
error_msg = f"Zimit exited with code {proc.returncode}"
|
||||||
try:
|
try:
|
||||||
with open(sf_log, 'r') as f:
|
log_result = subprocess.run(
|
||||||
f.seek(max(0, os.path.getsize(sf_log) - 500))
|
['docker', 'logs', '--tail', '50', container_name],
|
||||||
output = f.read()
|
capture_output=True, text=True, timeout=10
|
||||||
|
)
|
||||||
|
log_text = (log_result.stderr or log_result.stdout or '').strip()
|
||||||
|
if log_text:
|
||||||
|
# Take last 500 chars
|
||||||
|
error_msg += f": {log_text[-500:]}"
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
# SingleFile may still produce some files even with non-zero exit
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
page_count = _count_html_files(site_dir)
|
return 0, None, error_msg
|
||||||
if page_count == 0:
|
|
||||||
return 0, f"SingleFile failed with code {proc.returncode}: {output[-500:]}"
|
|
||||||
logger.warning(f"Job {job_id}: SingleFile exited {proc.returncode} but produced {page_count} pages, continuing")
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return 0, f"SingleFile error: {e}"
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
return 0, None, f"Zimit error: {e}"
|
||||||
|
|
||||||
# If no index.html exists, rename the first HTML file to index.html
|
# Find the output ZIM file
|
||||||
index_path = os.path.join(output_dir, 'index.html')
|
zim_files = _glob.glob(os.path.join(tmp_dir, '*.zim'))
|
||||||
if not os.path.isfile(index_path):
|
if not zim_files:
|
||||||
for f in sorted(os.listdir(output_dir)):
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
if f.lower().endswith(('.html', '.htm')):
|
return 0, None, 'Zimit produced no ZIM file'
|
||||||
src = os.path.join(output_dir, f)
|
|
||||||
os.rename(src, index_path)
|
|
||||||
logger.info(f"Job {job_id}: renamed {f} → index.html")
|
|
||||||
break
|
|
||||||
|
|
||||||
page_count = _count_html_files(site_dir)
|
src_zim = zim_files[0] # Should be exactly one
|
||||||
logger.info(f"Job {job_id}: SingleFile complete, {page_count} HTML pages found")
|
|
||||||
|
|
||||||
if page_count == 0:
|
# Reuse the page count the progress poll stored on the job row during the crawl (0 if none was recorded)
|
||||||
return 0, 'SingleFile produced no HTML files'
|
page_count = 0
|
||||||
|
try:
|
||||||
|
job_state = db.get_scrape_job(job_id)
|
||||||
|
page_count = job_state.get('page_count') or 0
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
return page_count, None
|
# Rename to final location
|
||||||
|
zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
|
||||||
|
zim_path = os.path.join(output_dir, zim_filename)
|
||||||
|
try:
|
||||||
|
shutil.move(src_zim, zim_path)
|
||||||
|
except Exception as e:
|
||||||
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
return 0, None, f"Failed to move ZIM to output dir: {e}"
|
||||||
|
|
||||||
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
logger.info(f"Job {job_id}: Zimit complete — {zim_filename}")
|
||||||
|
|
||||||
|
return page_count, zim_filename, None
|
||||||
|
|
||||||
|
|
||||||
# ── Main job pipeline ─────────────────────────────────────────────
|
# ── Main job pipeline ─────────────────────────────────────────────
|
||||||
|
|
@ -458,183 +273,43 @@ def _process_job(job, config, stop_event):
|
||||||
"""Execute the full scrape pipeline for a single job."""
|
"""Execute the full scrape pipeline for a single job."""
|
||||||
db = StatusDB()
|
db = StatusDB()
|
||||||
job_id = job['id']
|
job_id = job['id']
|
||||||
url = job['url']
|
|
||||||
title = job.get('title') or _sanitize_domain(url)
|
|
||||||
language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng')
|
|
||||||
category = job.get('category') or ''
|
|
||||||
|
|
||||||
scraper_cfg = config.get('scraper', {})
|
logger.info(f"Job {job_id}: starting scrape of {job['url']}")
|
||||||
workspace_root = scraper_cfg.get('workspace', '/opt/recon/data/scraper')
|
|
||||||
output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix')
|
|
||||||
keep_workspace = scraper_cfg.get('keep_workspace_on_failure', True)
|
|
||||||
|
|
||||||
workspace = os.path.join(workspace_root, str(job_id))
|
# ── Phase 1: Crawl via Zimit ───────────────────────────────────
|
||||||
site_dir = os.path.join(workspace, 'site')
|
|
||||||
os.makedirs(site_dir, exist_ok=True)
|
|
||||||
|
|
||||||
domain = _sanitize_domain(url)
|
|
||||||
date_tag = datetime.now().strftime('%Y-%m')
|
|
||||||
zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
|
|
||||||
zim_path = os.path.join(output_dir, zim_filename)
|
|
||||||
|
|
||||||
logger.info(f"Job {job_id}: starting scrape of {url}")
|
|
||||||
db.update_scrape_job(job_id,
|
db.update_scrape_job(job_id,
|
||||||
status='scraping',
|
status='scraping',
|
||||||
workspace_path=workspace,
|
crawl_mode='zimit',
|
||||||
started_at=_now())
|
started_at=_now())
|
||||||
|
|
||||||
# ── Phase 0: Pre-flight mode detection ─────────────────────────
|
|
||||||
if stop_event.is_set() or _check_cancelled(db, job_id):
|
if stop_event.is_set() or _check_cancelled(db, job_id):
|
||||||
_handle_cancel(db, job_id, workspace, keep_workspace)
|
_handle_cancel(db, job_id)
|
||||||
return
|
return
|
||||||
|
|
||||||
pre_set = job.get('crawl_mode')
|
page_count, zim_filename, error = _crawl_zimit(job, config, stop_event, db)
|
||||||
if pre_set:
|
|
||||||
crawl_mode, resolved_url = pre_set, url
|
|
||||||
logger.info(f"Job {job_id}: using pre-set crawl_mode={crawl_mode}")
|
|
||||||
else:
|
|
||||||
crawl_mode, resolved_url = _detect_crawl_mode(url, config)
|
|
||||||
logger.info(f"Job {job_id}: detected crawl_mode={crawl_mode}")
|
|
||||||
|
|
||||||
db.update_scrape_job(job_id, crawl_mode=crawl_mode)
|
|
||||||
|
|
||||||
# If redirect detected, update domain/filename to match resolved URL
|
|
||||||
if crawl_mode == 'redirect' and resolved_url != url:
|
|
||||||
logger.info(f"Job {job_id}: URL resolved from {url} → {resolved_url}")
|
|
||||||
domain = _sanitize_domain(resolved_url)
|
|
||||||
zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
|
|
||||||
zim_path = os.path.join(output_dir, zim_filename)
|
|
||||||
|
|
||||||
# ── Phase A: Crawl (dispatch to backend) ────────────────────────
|
|
||||||
if stop_event.is_set() or _check_cancelled(db, job_id):
|
|
||||||
_handle_cancel(db, job_id, workspace, keep_workspace)
|
|
||||||
return
|
|
||||||
|
|
||||||
if crawl_mode == 'browser':
|
|
||||||
page_count, error = _crawl_singlefile(job, resolved_url, site_dir, config, stop_event, db)
|
|
||||||
else: # 'static' or 'redirect'
|
|
||||||
page_count, error = _crawl_wget(job, resolved_url, site_dir, config, stop_event, db)
|
|
||||||
|
|
||||||
if error == 'cancelled':
|
if error == 'cancelled':
|
||||||
_handle_cancel(db, job_id, workspace, keep_workspace)
|
_handle_cancel(db, job_id)
|
||||||
return
|
return
|
||||||
elif error:
|
elif error:
|
||||||
db.update_scrape_job(job_id,
|
db.update_scrape_job(job_id,
|
||||||
status='failed',
|
status='failed',
|
||||||
error_message=error,
|
error_message=error[:1000],
|
||||||
subprocess_pid=None,
|
subprocess_pid=None,
|
||||||
completed_at=_now())
|
completed_at=_now())
|
||||||
if not keep_workspace:
|
|
||||||
shutil.rmtree(workspace, ignore_errors=True)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
db.update_scrape_job(job_id, page_count=page_count)
|
db.update_scrape_job(job_id, page_count=page_count)
|
||||||
|
|
||||||
# ── Phase B: Prepare zimwriterfs inputs ────────────────────────
|
# ── Phase 2: Register with kiwix-serve ─────────────────────────
|
||||||
if stop_event.is_set() or _check_cancelled(db, job_id):
|
if stop_event.is_set() or _check_cancelled(db, job_id):
|
||||||
_handle_cancel(db, job_id, workspace, keep_workspace)
|
_handle_cancel(db, job_id)
|
||||||
return
|
return
|
||||||
|
|
||||||
welcome_page, content_dir = _find_welcome_page(site_dir, domain)
|
db.update_scrape_job(job_id, status='registering')
|
||||||
if welcome_page is None:
|
|
||||||
welcome_page = 'index.html'
|
|
||||||
|
|
||||||
illustration_path = os.path.join(workspace, 'illustration.png')
|
|
||||||
_create_placeholder_illustration(illustration_path)
|
|
||||||
illust_dest = os.path.join(content_dir, 'illustration.png')
|
|
||||||
shutil.copy2(illustration_path, illust_dest)
|
|
||||||
|
|
||||||
description = f"Mirror of {domain}"
|
|
||||||
if category:
|
|
||||||
description = f"{category} — mirror of {domain}"
|
|
||||||
|
|
||||||
logger.info(f"Job {job_id}: packaging ZIM (welcome={welcome_page}, content_dir={content_dir})")
|
|
||||||
db.update_scrape_job(job_id, status='packaging')
|
|
||||||
|
|
||||||
# ── Phase C: zimwriterfs ───────────────────────────────────────
|
|
||||||
if stop_event.is_set() or _check_cancelled(db, job_id):
|
|
||||||
_handle_cancel(db, job_id, workspace, keep_workspace)
|
|
||||||
return
|
|
||||||
|
|
||||||
zim_name = _sanitize_filename(domain)
|
|
||||||
long_description = f"Offline mirror of {resolved_url} created by RECON web scraper"
|
|
||||||
|
|
||||||
zim_cmd = [
|
|
||||||
'zimwriterfs',
|
|
||||||
f'--welcome={welcome_page}',
|
|
||||||
f'--illustration=illustration.png',
|
|
||||||
f'--language={language}',
|
|
||||||
f'--title={title}',
|
|
||||||
f'--description={description[:80]}',
|
|
||||||
f'--longDescription={long_description[:4096]}',
|
|
||||||
f'--name={zim_name}',
|
|
||||||
f'--creator={domain}',
|
|
||||||
'--publisher=RECON',
|
|
||||||
content_dir,
|
|
||||||
zim_path,
|
|
||||||
]
|
|
||||||
|
|
||||||
zim_log = os.path.join(workspace, 'zimwriterfs.log')
|
|
||||||
try:
|
|
||||||
with open(zim_log, 'w') as log_fh:
|
|
||||||
proc = subprocess.Popen(
|
|
||||||
zim_cmd,
|
|
||||||
stdout=log_fh, stderr=subprocess.STDOUT,
|
|
||||||
)
|
|
||||||
db.update_scrape_job(job_id, subprocess_pid=proc.pid)
|
|
||||||
|
|
||||||
while proc.poll() is None:
|
|
||||||
if stop_event.is_set() or _check_cancelled(db, job_id):
|
|
||||||
_kill_process(proc)
|
|
||||||
_handle_cancel(db, job_id, workspace, keep_workspace)
|
|
||||||
return
|
|
||||||
try:
|
|
||||||
proc.wait(timeout=5)
|
|
||||||
except subprocess.TimeoutExpired:
|
|
||||||
pass
|
|
||||||
|
|
||||||
db.update_scrape_job(job_id, subprocess_pid=None)
|
|
||||||
|
|
||||||
if stop_event.is_set() or _check_cancelled(db, job_id):
|
|
||||||
_handle_cancel(db, job_id, workspace, keep_workspace)
|
|
||||||
return
|
|
||||||
|
|
||||||
if proc.returncode != 0:
|
|
||||||
output = ''
|
|
||||||
try:
|
|
||||||
with open(zim_log, 'r') as f:
|
|
||||||
f.seek(max(0, os.path.getsize(zim_log) - 500))
|
|
||||||
output = f.read()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
raise RuntimeError(f"zimwriterfs failed with code {proc.returncode}: {output[-500:]}")
|
|
||||||
|
|
||||||
except RuntimeError:
|
|
||||||
raise
|
|
||||||
except Exception as e:
|
|
||||||
db.update_scrape_job(job_id,
|
|
||||||
status='failed',
|
|
||||||
error_message=f"zimwriterfs error: {e}",
|
|
||||||
subprocess_pid=None,
|
|
||||||
completed_at=_now())
|
|
||||||
if not keep_workspace:
|
|
||||||
shutil.rmtree(workspace, ignore_errors=True)
|
|
||||||
return
|
|
||||||
|
|
||||||
if not os.path.isfile(zim_path):
|
|
||||||
db.update_scrape_job(job_id,
|
|
||||||
status='failed',
|
|
||||||
error_message='zimwriterfs produced no output file',
|
|
||||||
completed_at=_now())
|
|
||||||
return
|
|
||||||
|
|
||||||
logger.info(f"Job {job_id}: ZIM created at {zim_path}")
|
|
||||||
|
|
||||||
# ── Phase D: kiwix-manage + registration ───────────────────────
|
|
||||||
if stop_event.is_set() or _check_cancelled(db, job_id):
|
|
||||||
_handle_cancel(db, job_id, workspace, keep_workspace)
|
|
||||||
return
|
|
||||||
|
|
||||||
|
output_dir = config.get('scraper', {}).get('output_dir', '/mnt/kiwix')
|
||||||
|
zim_path = os.path.join(output_dir, zim_filename)
|
||||||
kiwix_manage = shutil.which('kiwix-manage') or '/opt/recon/bin/kiwix-manage'
|
kiwix_manage = shutil.which('kiwix-manage') or '/opt/recon/bin/kiwix-manage'
|
||||||
library_xml = '/mnt/kiwix/library.xml'
|
library_xml = '/mnt/kiwix/library.xml'
|
||||||
|
|
||||||
|
|
@ -670,26 +345,32 @@ def _process_job(job, config, stop_event):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Job {job_id}: scan_zims failed: {e}")
|
logger.warning(f"Job {job_id}: scan_zims failed: {e}")
|
||||||
|
|
||||||
try:
|
# ── Phase 3: Complete ──────────────────────────────────────────
|
||||||
shutil.rmtree(workspace, ignore_errors=True)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
db.update_scrape_job(job_id,
|
db.update_scrape_job(job_id,
|
||||||
status='complete',
|
status='complete',
|
||||||
zim_filename=zim_filename,
|
zim_filename=zim_filename,
|
||||||
zim_source_id=zim_source_id,
|
zim_source_id=zim_source_id,
|
||||||
completed_at=_now())
|
completed_at=_now())
|
||||||
|
|
||||||
logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages, mode={crawl_mode})")
|
logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages)")
|
||||||
|
|
||||||
|
|
||||||
def _handle_cancel(db, job_id, workspace, keep_workspace):
|
def _handle_cancel(db, job_id):
|
||||||
"""Handle job cancellation: clean up and update status."""
|
"""Handle job cancellation: clean up Docker container and update status."""
|
||||||
|
container_name = f'recon-scraper-{job_id}'
|
||||||
|
try:
|
||||||
|
subprocess.run(['docker', 'rm', '-f', container_name],
|
||||||
|
capture_output=True, timeout=10)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Clean up tmp dir if it exists
|
||||||
|
output_dir = '/mnt/kiwix'
|
||||||
|
tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}')
|
||||||
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
|
||||||
logger.info(f"Job {job_id}: cancelled")
|
logger.info(f"Job {job_id}: cancelled")
|
||||||
db.update_scrape_job(job_id,
|
db.update_scrape_job(job_id,
|
||||||
status='cancelled',
|
status='cancelled',
|
||||||
subprocess_pid=None,
|
subprocess_pid=None,
|
||||||
completed_at=_now())
|
completed_at=_now())
|
||||||
if not keep_workspace:
|
|
||||||
shutil.rmtree(workspace, ignore_errors=True)
|
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@
|
||||||
jobs.forEach(function(j) {
|
jobs.forEach(function(j) {
|
||||||
if (j.status === 'complete') complete++;
|
if (j.status === 'complete') complete++;
|
||||||
else if (j.status === 'failed' || j.status === 'cancelled') failed++;
|
else if (j.status === 'failed' || j.status === 'cancelled') failed++;
|
||||||
else if (j.status === 'running' || j.status === 'pending') active++;
|
else if (j.status === 'scraping' || j.status === 'registering' || j.status === 'pending') active++;
|
||||||
});
|
});
|
||||||
RECON.set('sc-total', RECON.fmt(total));
|
RECON.set('sc-total', RECON.fmt(total));
|
||||||
RECON.set('sc-active', RECON.fmt(active));
|
RECON.set('sc-active', RECON.fmt(active));
|
||||||
|
|
@ -27,14 +27,12 @@
|
||||||
var html = '';
|
var html = '';
|
||||||
jobs.forEach(function(j) {
|
jobs.forEach(function(j) {
|
||||||
var badge = statusBadge(j.status);
|
var badge = statusBadge(j.status);
|
||||||
var mode = j.crawl_mode ?
|
|
||||||
'<span class="text-small">' + j.crawl_mode + '</span>' : '<span class="text-muted">\u2014</span>';
|
|
||||||
var pages = j.page_count ? RECON.fmt(j.page_count) : '\u2014';
|
var pages = j.page_count ? RECON.fmt(j.page_count) : '\u2014';
|
||||||
var zim = j.zim_filename ?
|
var zim = j.zim_filename ?
|
||||||
'<span class="text-small">' + j.zim_filename + '</span>' : '\u2014';
|
'<span class="text-small">' + j.zim_filename + '</span>' : '\u2014';
|
||||||
var actions = '';
|
var actions = '';
|
||||||
|
|
||||||
if (j.status === 'running' || j.status === 'pending') {
|
if (j.status === 'scraping' || j.status === 'registering' || j.status === 'pending') {
|
||||||
actions = '<button class="btn btn-danger" onclick="SCRAPER.cancel(' + j.id + ')">Cancel</button>';
|
actions = '<button class="btn btn-danger" onclick="SCRAPER.cancel(' + j.id + ')">Cancel</button>';
|
||||||
} else if (j.status === 'failed' || j.status === 'cancelled') {
|
} else if (j.status === 'failed' || j.status === 'cancelled') {
|
||||||
actions = '<button class="btn" onclick="SCRAPER.retry(' + j.id + ')">Retry</button> ' +
|
actions = '<button class="btn" onclick="SCRAPER.retry(' + j.id + ')">Retry</button> ' +
|
||||||
|
|
@ -50,14 +48,13 @@
|
||||||
'<td>' + j.id + '</td>' +
|
'<td>' + j.id + '</td>' +
|
||||||
'<td><a href="' + escHtml(j.url) + '" target="_blank" title="' + escHtml(j.url) + '">' + escHtml(displayUrl) + '</a></td>' +
|
'<td><a href="' + escHtml(j.url) + '" target="_blank" title="' + escHtml(j.url) + '">' + escHtml(displayUrl) + '</a></td>' +
|
||||||
'<td>' + escHtml(j.title || '\u2014') + '</td>' +
|
'<td>' + escHtml(j.title || '\u2014') + '</td>' +
|
||||||
'<td>' + mode + '</td>' +
|
|
||||||
'<td>' + pages + '</td>' +
|
'<td>' + pages + '</td>' +
|
||||||
'<td>' + badge + errorTooltip(j) + '</td>' +
|
'<td>' + badge + errorTooltip(j) + '</td>' +
|
||||||
'<td>' + zim + '</td>' +
|
'<td>' + zim + '</td>' +
|
||||||
'<td>' + actions + '</td>' +
|
'<td>' + actions + '</td>' +
|
||||||
'</tr>';
|
'</tr>';
|
||||||
});
|
});
|
||||||
if (!html) html = '<tr><td colspan="8" class="text-muted">No scrape jobs</td></tr>';
|
if (!html) html = '<tr><td colspan="7" class="text-muted">No scrape jobs</td></tr>';
|
||||||
RECON.setHTML('sc-table-body', html);
|
RECON.setHTML('sc-table-body', html);
|
||||||
}).catch(function(err) {
|
}).catch(function(err) {
|
||||||
console.error('Scraper dashboard error:', err);
|
console.error('Scraper dashboard error:', err);
|
||||||
|
|
@ -67,7 +64,8 @@
|
||||||
function statusBadge(status) {
|
function statusBadge(status) {
|
||||||
var map = {
|
var map = {
|
||||||
'pending': '<span class="badge-detected">PENDING</span>',
|
'pending': '<span class="badge-detected">PENDING</span>',
|
||||||
'running': '<span class="badge-processing">RUNNING</span>',
|
'scraping': '<span class="badge-processing">SCRAPING</span>',
|
||||||
|
'registering': '<span class="badge-processing">REGISTERING</span>',
|
||||||
'complete': '<span class="badge-complete">COMPLETE</span>',
|
'complete': '<span class="badge-complete">COMPLETE</span>',
|
||||||
'failed': '<span class="badge-failed">FAILED</span>',
|
'failed': '<span class="badge-failed">FAILED</span>',
|
||||||
'cancelled': '<span class="badge-detected">CANCELLED</span>'
|
'cancelled': '<span class="badge-detected">CANCELLED</span>'
|
||||||
|
|
@ -98,12 +96,9 @@
|
||||||
var title = document.getElementById('sf-title').value.trim();
|
var title = document.getElementById('sf-title').value.trim();
|
||||||
var lang = document.getElementById('sf-lang').value;
|
var lang = document.getElementById('sf-lang').value;
|
||||||
var category = document.getElementById('sf-category').value.trim();
|
var category = document.getElementById('sf-category').value.trim();
|
||||||
var mode = document.getElementById('sf-mode').value;
|
|
||||||
|
|
||||||
if (title) body.title = title;
|
if (title) body.title = title;
|
||||||
if (lang) body.language = lang;
|
if (lang) body.language = lang;
|
||||||
if (category) body.category = category;
|
if (category) body.category = category;
|
||||||
if (mode) body.crawl_mode = mode;
|
|
||||||
|
|
||||||
var btn = document.getElementById('sf-submit-btn');
|
var btn = document.getElementById('sf-submit-btn');
|
||||||
var feedback = document.getElementById('sf-feedback');
|
var feedback = document.getElementById('sf-feedback');
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,7 @@
|
||||||
style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
|
style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div style="display:grid;grid-template-columns:1fr 1fr 1fr auto;gap:12px;align-items:end;">
|
<div style="display:grid;grid-template-columns:1fr 1fr auto;gap:12px;align-items:end;">
|
||||||
<div>
|
<div>
|
||||||
<label class="text-small text-muted" style="display:block;margin-bottom:4px;">Language</label>
|
<label class="text-small text-muted" style="display:block;margin-bottom:4px;">Language</label>
|
||||||
<select id="sf-lang"
|
<select id="sf-lang"
|
||||||
|
|
@ -38,15 +38,6 @@
|
||||||
<input type="text" id="sf-category" placeholder="Optional"
|
<input type="text" id="sf-category" placeholder="Optional"
|
||||||
style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
|
style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
|
||||||
</div>
|
</div>
|
||||||
<div>
|
|
||||||
<label class="text-small text-muted" style="display:block;margin-bottom:4px;">Crawl Mode</label>
|
|
||||||
<select id="sf-mode"
|
|
||||||
style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
|
|
||||||
<option value="" selected>Auto-detect</option>
|
|
||||||
<option value="static">Static (wget)</option>
|
|
||||||
<option value="browser">Browser (SingleFile)</option>
|
|
||||||
</select>
|
|
||||||
</div>
|
|
||||||
<div>
|
<div>
|
||||||
<button type="submit" class="btn" id="sf-submit-btn">Submit</button>
|
<button type="submit" class="btn" id="sf-submit-btn">Submit</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
@ -75,7 +66,6 @@
|
||||||
<th>ID</th>
|
<th>ID</th>
|
||||||
<th>URL</th>
|
<th>URL</th>
|
||||||
<th>Title</th>
|
<th>Title</th>
|
||||||
<th>Mode</th>
|
|
||||||
<th>Pages</th>
|
<th>Pages</th>
|
||||||
<th>Status</th>
|
<th>Status</th>
|
||||||
<th>ZIM</th>
|
<th>ZIM</th>
|
||||||
|
|
@ -83,7 +73,7 @@
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody id="sc-table-body">
|
<tbody id="sc-table-body">
|
||||||
<tr><td colspan="8" class="text-muted">Loading...</td></tr>
|
<tr><td colspan="7" class="text-muted">Loading...</td></tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue