Fix progress parsing for Browsertrix JSON log format

Parse "crawled":N from Browsertrix crawlStatus JSON logs instead of
looking for an "N pages" pattern. Also read stdout, falling back to
stderr only when stdout is empty (previously only stderr was read).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-19 19:33:50 +00:00
commit 9692044790

View file

@ -190,12 +190,12 @@ def _crawl_zimit(job, config, stop_event, db):
['docker', 'logs', '--tail', '20', container_name], ['docker', 'logs', '--tail', '20', container_name],
capture_output=True, text=True, timeout=10 capture_output=True, text=True, timeout=10
) )
if log_result.returncode == 0 and log_result.stderr: if log_result.returncode == 0:
# Zimit/Browsertrix logs page counts — look for numbers # Browsertrix logs JSON with "crawled":N — check both stdout and stderr
lines = log_result.stderr.strip().split('\n') log_text = log_result.stdout or log_result.stderr or ''
lines = log_text.strip().split('\n')
for line in reversed(lines): for line in reversed(lines):
# Look for patterns like "X pages" or page count indicators match = re.search(r'"crawled":(\d+)', line)
match = re.search(r'(\d+)\s+page', line, re.IGNORECASE)
if match: if match:
count = int(match.group(1)) count = int(match.group(1))
if count > 0: if count > 0: