mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Add scraper Phase 2: smart crawl mode detection + browser fallback
- Pre-flight detection: wget + Playwright probe to auto-detect if site needs browser rendering (JS apps, parking page redirects)
- SingleFile CLI crawl backend for JS-rendered sites
- crawl_mode column in scrape_jobs (static/browser/redirect/auto)
- API: optional crawl_mode param on submit, cleared on retry
- Config: rate_limit_delay 2.0→0.5, /api/ reject pattern, preflight + singlefile config sections
- Prerequisites: Node.js 22, single-file-cli, Playwright + Chromium

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
491a4350fc
commit
da50e5f0b8
5 changed files with 977 additions and 0 deletions
11
recon.py
11
recon.py
|
|
@@ -692,12 +692,23 @@ def cmd_service(args):
|
|||
daemon=True, name='dashboard'),
|
||||
]
|
||||
|
||||
# Scraper daemon: polls for pending scrape jobs, runs wget+zimwriterfs pipeline
|
||||
scraper_cfg = config.get('scraper', {})
|
||||
if scraper_cfg.get('workspace'):
|
||||
from lib.scraper_runner import scraper_loop
|
||||
threads.append(
|
||||
threading.Thread(target=lambda: scraper_loop(stop_event, config),
|
||||
daemon=True, name='scraper')
|
||||
)
|
||||
|
||||
logger.info("=== RECON Service Starting ===")
|
||||
logger.info(f" Dashboard: {web_host}:{web_port}")
|
||||
logger.info(f" Workers: enrich={enrich_workers}, embed={embed_workers}")
|
||||
logger.info(f" Dispatcher: every {dispatch_interval}s | Filing: every {filing_interval}s")
|
||||
pt_interval = config.get("peertube", {}).get("poll_interval", 1800)
|
||||
logger.info(f" PeerTube acquisition: every {pt_interval}s")
|
||||
if scraper_cfg.get('workspace'):
|
||||
logger.info(f" Scraper: every {scraper_cfg.get('poll_interval', 300)}s")
|
||||
logger.info(f" Progress: every {progress_interval}s")
|
||||
|
||||
for t in threads:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue