mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Replace wget/SingleFile/Playwright backends with Zimit
- Zimit Docker container handles all site types (static, SPA, JS redirects)
- Removed: _detect_crawl_mode, _crawl_wget, _crawl_singlefile, preflight logic
- Added: _crawl_zimit() with Docker lifecycle management
- Simplified pipeline: submit → Zimit crawl → kiwix-manage register → done
- No more zimwriterfs step — Zimit produces ZIM directly
- Dashboard UI simplified: removed crawl mode dropdown
- Config simplified: removed reject patterns, preflight, singlefile sections

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
f0b160ef7c
commit
8945c82e3f
5 changed files with 212 additions and 606 deletions
75
config.yaml
75
config.yaml
|
|
@@ -414,81 +414,12 @@ peertube:
|
|||
poll_interval: 1800 # Seconds between PeerTube acquisition polls (30 min)
|
||||
|
||||
scraper:
|
||||
workspace: /opt/recon/data/scraper # Working directory for wget mirrors + ZIM builds
|
||||
workspace: /opt/recon/data/scraper # Working directory (tmp dirs for Zimit output)
|
||||
output_dir: /mnt/kiwix # Finished .zim files land here (kiwix-serve library)
|
||||
rate_limit_delay: 0.5 # Seconds between wget requests (--wait)
|
||||
wait_random: 1.0 # Random jitter added to wait (--random-wait range)
|
||||
default_language: eng # ISO 639-3 language code for ZIM metadata
|
||||
user_agent: "Mozilla/5.0 (compatible; RECON/1.0; +https://echo6.co)"
|
||||
poll_interval: 300 # Seconds between checking for pending scrape jobs
|
||||
keep_workspace_on_failure: true # Retain workspace for debugging when a job fails
|
||||
|
||||
# Default URL patterns rejected by wget --reject-regex.
|
||||
# Covers common CMS junk across WordPress, Squarespace, Wix, Ghost, Drupal, etc.
|
||||
# Per-job overrides: additional_reject_patterns (appended) or skip_default_patterns (bypass).
|
||||
default_reject_patterns:
|
||||
# WordPress
|
||||
- '\?share='
|
||||
- '\?replytocom='
|
||||
- '\?like_comment='
|
||||
- '/feed/'
|
||||
- '/wp-json/'
|
||||
- '/wp-login'
|
||||
- '/wp-admin'
|
||||
- '/wp-cron'
|
||||
- '\?attachment_id='
|
||||
- '/xmlrpc'
|
||||
- '/trackback'
|
||||
- '/comment-page-'
|
||||
- '\?doing_wp_cron'
|
||||
# Squarespace
|
||||
- '\?format=json'
|
||||
- '\?format=rss'
|
||||
- '/api/'
|
||||
# Wix
|
||||
- '/_api/'
|
||||
- '/_partials/'
|
||||
# Ghost
|
||||
- '/ghost/'
|
||||
- '/p/'
|
||||
# Drupal
|
||||
- '\?q=comment'
|
||||
- '\?q=node'
|
||||
- '/user/login'
|
||||
- '/user/register'
|
||||
# General CMS / site chrome
|
||||
- '/login'
|
||||
- '/signup'
|
||||
- '/register'
|
||||
- '/cart'
|
||||
- '/checkout'
|
||||
- '/search\?'
|
||||
- '/tag/'
|
||||
- '/author/'
|
||||
- '\?print='
|
||||
- '\?pdf='
|
||||
- '\?format=amp'
|
||||
- '\?preview='
|
||||
- '/rss'
|
||||
- '/atom'
|
||||
- '/cdn-cgi/'
|
||||
|
||||
# Pre-flight mode detection
|
||||
preflight:
|
||||
enabled: true
|
||||
timeout: 30 # Seconds for single-page Playwright fetch
|
||||
min_static_size: 5120 # 5KB - wget HTML below this = suspect JS site
|
||||
min_browser_size: 20480 # 20KB - browser HTML above this confirms JS
|
||||
spa_markers:
|
||||
- 'div#root'
|
||||
- 'div#app'
|
||||
- 'div#__next'
|
||||
|
||||
# SingleFile CLI settings (browser crawl mode)
|
||||
singlefile:
|
||||
executable: single-file
|
||||
chromium_path: "/usr/bin/chromium-browser"
|
||||
crawl_max_depth: 10
|
||||
docker_image: ghcr.io/openzim/zimit # Zimit Docker image for web crawling
|
||||
docker_workers: 2 # Concurrent crawl workers inside Zimit container
|
||||
|
||||
# Stream B: New Library Pipeline
|
||||
new_pipeline:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue