mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Fix progress parsing for Browsertrix JSON log format
Parse "crawled":N from Browsertrix crawlStatus JSON logs instead of looking for an "N pages" pattern. Also check stdout (not just stderr). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
b035ba3f20
commit
9692044790
1 changed file with 5 additions and 5 deletions
|
|
@@ -190,12 +190,12 @@ def _crawl_zimit(job, config, stop_event, db):
|
||||||
['docker', 'logs', '--tail', '20', container_name],
|
['docker', 'logs', '--tail', '20', container_name],
|
||||||
capture_output=True, text=True, timeout=10
|
capture_output=True, text=True, timeout=10
|
||||||
)
|
)
|
||||||
if log_result.returncode == 0 and log_result.stderr:
|
if log_result.returncode == 0:
|
||||||
# Zimit/Browsertrix logs page counts — look for numbers
|
# Browsertrix logs JSON with "crawled":N — check both stdout and stderr
|
||||||
lines = log_result.stderr.strip().split('\n')
|
log_text = log_result.stdout or log_result.stderr or ''
|
||||||
|
lines = log_text.strip().split('\n')
|
||||||
for line in reversed(lines):
|
for line in reversed(lines):
|
||||||
# Look for patterns like "X pages" or page count indicators
|
match = re.search(r'"crawled":(\d+)', line)
|
||||||
match = re.search(r'(\d+)\s+page', line, re.IGNORECASE)
|
|
||||||
if match:
|
if match:
|
||||||
count = int(match.group(1))
|
count = int(match.group(1))
|
||||||
if count > 0:
|
if count > 0:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue