From 125602fa1369ea0b1ec7a98406e2321473e428d1 Mon Sep 17 00:00:00 2001
From: Matt
Date: Sat, 18 Apr 2026 19:28:03 +0000
Subject: [PATCH] Fix SingleFile CLI: remove invalid --crawl-delay flag

SingleFile CLI has no --crawl-delay option. The invalid flag caused
the process to print help and exit with no output. Added
--crawl-no-parent and --crawl-replace-URLs instead. Removed unused
crawl_delay config key.

Co-Authored-By: Claude Opus 4.6
---
 config.yaml           | 1 -
 lib/scraper_runner.py | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/config.yaml b/config.yaml
index c98a866..bdabf69 100644
--- a/config.yaml
+++ b/config.yaml
@@ -489,7 +489,6 @@ scraper:
     executable: single-file
     chromium_path: "" # Auto-detected from Playwright if empty
     crawl_max_depth: 10
-    crawl_delay: 2 # Seconds between page fetches
 
 # Stream B: New Library Pipeline
 new_pipeline:
diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py
index 1599f2e..a3ff820 100644
--- a/lib/scraper_runner.py
+++ b/lib/scraper_runner.py
@@ -368,7 +368,6 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
     executable = sf_cfg.get('executable', 'single-file')
     chromium_path = _get_chromium_path(config)
     crawl_max_depth = sf_cfg.get('crawl_max_depth', 10)
-    crawl_delay = sf_cfg.get('crawl_delay', 2)
 
     if not chromium_path:
         return 0, 'Chromium not found — cannot use browser crawl mode'
@@ -382,8 +381,9 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
         executable,
         '--crawl-links=true',
         '--crawl-inner-links-only=true',
+        '--crawl-no-parent=true',
+        '--crawl-replace-URLs=true',
         f'--crawl-max-depth={crawl_max_depth}',
-        f'--crawl-delay={crawl_delay * 1000}', # milliseconds
         f'--browser-executable-path={chromium_path}',
         '--browser-headless=true',
         '--browser-args=["--no-sandbox","--disable-dev-shm-usage"]',
@@ -391,7 +391,7 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
         url,
     ]
 
-    logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth}, delay={crawl_delay}s)")
+    logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth})")
     sf_log = os.path.join(workspace, 'singlefile.log')
     try:
         with open(sf_log, 'w') as log_fh: