Add scraper Phase 2: smart crawl mode detection + browser fallback

- Pre-flight detection: wget + Playwright probe to auto-detect if site
  needs browser rendering (JS apps, parking page redirects)
- SingleFile CLI crawl backend for JS-rendered sites
- crawl_mode column in scrape_jobs (static/browser/redirect/auto)
- API: optional crawl_mode param on submit, cleared on retry
- Config: rate_limit_delay 2.0→0.5, /api/ reject pattern, preflight
  + singlefile config sections
- Prerequisites: Node.js 22, single-file-cli, Playwright + Chromium

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-18 18:26:43 +00:00
commit da50e5f0b8
5 changed files with 977 additions and 0 deletions

View file

@ -692,12 +692,23 @@ def cmd_service(args):
daemon=True, name='dashboard'),
]
# Scraper daemon: polls for pending scrape jobs, runs wget+zimwriterfs pipeline
scraper_cfg = config.get('scraper', {})
if scraper_cfg.get('workspace'):
from lib.scraper_runner import scraper_loop
threads.append(
threading.Thread(target=lambda: scraper_loop(stop_event, config),
daemon=True, name='scraper')
)
logger.info("=== RECON Service Starting ===")
logger.info(f" Dashboard: {web_host}:{web_port}")
logger.info(f" Workers: enrich={enrich_workers}, embed={embed_workers}")
logger.info(f" Dispatcher: every {dispatch_interval}s | Filing: every {filing_interval}s")
pt_interval = config.get("peertube", {}).get("poll_interval", 1800)
logger.info(f" PeerTube acquisition: every {pt_interval}s")
if scraper_cfg.get('workspace'):
logger.info(f" Scraper: every {scraper_cfg.get('poll_interval', 300)}s")
logger.info(f" Progress: every {progress_interval}s")
for t in threads: