Fix ZIM filename collisions by appending job ID

Format: {domain}_{lang}_{YYYY-MM}_{job_id}.zim
Prevents zimwriterfs failures when the same domain is scraped
multiple times in the same month.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-18 20:17:53 +00:00
commit 45b954fccc

View file

@@ -474,7 +474,7 @@ def _process_job(job, config, stop_event):
domain = _sanitize_domain(url) domain = _sanitize_domain(url)
date_tag = datetime.now().strftime('%Y-%m') date_tag = datetime.now().strftime('%Y-%m')
zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}.zim" zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
zim_path = os.path.join(output_dir, zim_filename) zim_path = os.path.join(output_dir, zim_filename)
logger.info(f"Job {job_id}: starting scrape of {url}") logger.info(f"Job {job_id}: starting scrape of {url}")
@@ -502,7 +502,7 @@ def _process_job(job, config, stop_event):
if crawl_mode == 'redirect' and resolved_url != url: if crawl_mode == 'redirect' and resolved_url != url:
logger.info(f"Job {job_id}: URL resolved from {url}{resolved_url}") logger.info(f"Job {job_id}: URL resolved from {url}{resolved_url}")
domain = _sanitize_domain(resolved_url) domain = _sanitize_domain(resolved_url)
zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}.zim" zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
zim_path = os.path.join(output_dir, zim_filename) zim_path = os.path.join(output_dir, zim_filename)
# ── Phase A: Crawl (dispatch to backend) ──────────────────────── # ── Phase A: Crawl (dispatch to backend) ────────────────────────