mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 14:44:54 +02:00
Fix ZIM filename collisions by appending job ID
Format: {domain}_{lang}_{YYYY-MM}_{job_id}.zim
Prevents zimwriterfs failures when the same domain is scraped
multiple times in the same month.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
125602fa13
commit
45b954fccc
1 changed file with 2 additions and 2 deletions
|
|
@@ -474,7 +474,7 @@ def _process_job(job, config, stop_event):
|
||||||
|
|
||||||
domain = _sanitize_domain(url)
|
domain = _sanitize_domain(url)
|
||||||
date_tag = datetime.now().strftime('%Y-%m')
|
date_tag = datetime.now().strftime('%Y-%m')
|
||||||
zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}.zim"
|
zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
|
||||||
zim_path = os.path.join(output_dir, zim_filename)
|
zim_path = os.path.join(output_dir, zim_filename)
|
||||||
|
|
||||||
logger.info(f"Job {job_id}: starting scrape of {url}")
|
logger.info(f"Job {job_id}: starting scrape of {url}")
|
||||||
|
|
@@ -502,7 +502,7 @@ def _process_job(job, config, stop_event):
|
||||||
if crawl_mode == 'redirect' and resolved_url != url:
|
if crawl_mode == 'redirect' and resolved_url != url:
|
||||||
logger.info(f"Job {job_id}: URL resolved from {url} → {resolved_url}")
|
logger.info(f"Job {job_id}: URL resolved from {url} → {resolved_url}")
|
||||||
domain = _sanitize_domain(resolved_url)
|
domain = _sanitize_domain(resolved_url)
|
||||||
zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}.zim"
|
zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
|
||||||
zim_path = os.path.join(output_dir, zim_filename)
|
zim_path = os.path.join(output_dir, zim_filename)
|
||||||
|
|
||||||
# ── Phase A: Crawl (dispatch to backend) ────────────────────────
|
# ── Phase A: Crawl (dispatch to backend) ────────────────────────
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue