From 96920447900037310efcc0a30cb45d1a1c7effe5 Mon Sep 17 00:00:00 2001
From: Matt <matt@echo6.co>
Date: Sun, 19 Apr 2026 19:33:50 +0000
Subject: [PATCH] Fix progress parsing for Browsertrix JSON log format

Parse "crawled":N from Browsertrix crawlStatus JSON log lines instead of
matching the old "N pages" pattern. Also read stdout, falling back to
stderr only when stdout is empty (previously only stderr was read).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 lib/scraper_runner.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/scraper_runner.py b/lib/scraper_runner.py
index eb50695..d6b0299 100644
--- a/lib/scraper_runner.py
+++ b/lib/scraper_runner.py
@@ -190,12 +190,12 @@ def _crawl_zimit(job, config, stop_event, db):
                         ['docker', 'logs', '--tail', '20', container_name],
                         capture_output=True, text=True, timeout=10
                     )
-                    if log_result.returncode == 0 and log_result.stderr:
-                        # Zimit/Browsertrix logs page counts — look for numbers
-                        lines = log_result.stderr.strip().split('\n')
+                    if log_result.returncode == 0:
+                        # Browsertrix logs JSON with "crawled":N — prefer stdout, fall back to stderr
+                        log_text = log_result.stdout or log_result.stderr or ''
+                        lines = log_text.strip().split('\n')
                         for line in reversed(lines):
-                            # Look for patterns like "X pages" or page count indicators
-                            match = re.search(r'(\d+)\s+page', line, re.IGNORECASE)
+                            match = re.search(r'"crawled":(\d+)', line)
                             if match:
                                 count = int(match.group(1))
                                 if count > 0: