mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Fix SingleFile CLI: remove invalid --crawl-delay flag
SingleFile CLI has no --crawl-delay option. The invalid flag caused the process to print help and exit with no output.

Added --crawl-no-parent and --crawl-replace-URLs instead. Removed unused crawl_delay config key.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
da50e5f0b8
commit
125602fa13
2 changed files with 3 additions and 4 deletions
|
|
@@ -489,7 +489,6 @@ scraper:
|
||||||
executable: single-file
|
executable: single-file
|
||||||
chromium_path: "" # Auto-detected from Playwright if empty
|
chromium_path: "" # Auto-detected from Playwright if empty
|
||||||
crawl_max_depth: 10
|
crawl_max_depth: 10
|
||||||
crawl_delay: 2 # Seconds between page fetches
|
|
||||||
|
|
||||||
# Stream B: New Library Pipeline
|
# Stream B: New Library Pipeline
|
||||||
new_pipeline:
|
new_pipeline:
|
||||||
|
|
|
||||||
|
|
@@ -368,7 +368,6 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
|
||||||
executable = sf_cfg.get('executable', 'single-file')
|
executable = sf_cfg.get('executable', 'single-file')
|
||||||
chromium_path = _get_chromium_path(config)
|
chromium_path = _get_chromium_path(config)
|
||||||
crawl_max_depth = sf_cfg.get('crawl_max_depth', 10)
|
crawl_max_depth = sf_cfg.get('crawl_max_depth', 10)
|
||||||
crawl_delay = sf_cfg.get('crawl_delay', 2)
|
|
||||||
|
|
||||||
if not chromium_path:
|
if not chromium_path:
|
||||||
return 0, 'Chromium not found — cannot use browser crawl mode'
|
return 0, 'Chromium not found — cannot use browser crawl mode'
|
||||||
|
|
@@ -382,8 +381,9 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
|
||||||
executable,
|
executable,
|
||||||
'--crawl-links=true',
|
'--crawl-links=true',
|
||||||
'--crawl-inner-links-only=true',
|
'--crawl-inner-links-only=true',
|
||||||
|
'--crawl-no-parent=true',
|
||||||
|
'--crawl-replace-URLs=true',
|
||||||
f'--crawl-max-depth={crawl_max_depth}',
|
f'--crawl-max-depth={crawl_max_depth}',
|
||||||
f'--crawl-delay={crawl_delay * 1000}', # milliseconds
|
|
||||||
f'--browser-executable-path={chromium_path}',
|
f'--browser-executable-path={chromium_path}',
|
||||||
'--browser-headless=true',
|
'--browser-headless=true',
|
||||||
'--browser-args=["--no-sandbox","--disable-dev-shm-usage"]',
|
'--browser-args=["--no-sandbox","--disable-dev-shm-usage"]',
|
||||||
|
|
@@ -391,7 +391,7 @@ def _crawl_singlefile(job, url, site_dir, config, stop_event, db):
|
||||||
url,
|
url,
|
||||||
]
|
]
|
||||||
|
|
||||||
logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth}, delay={crawl_delay}s)")
|
logger.info(f"Job {job_id}: SingleFile crawl starting (depth={crawl_max_depth})")
|
||||||
sf_log = os.path.join(workspace, 'singlefile.log')
|
sf_log = os.path.join(workspace, 'singlefile.log')
|
||||||
try:
|
try:
|
||||||
with open(sf_log, 'w') as log_fh:
|
with open(sf_log, 'w') as log_fh:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue