#!/usr/bin/env python3 """ repair_corrupted.py — Repairs window files corrupted by concurrent writes. Strategy: 1. Read corrupted_windows.txt to get the list of bad files 2. For each bad file, identify the parent doc hash from the path 3. Check if the text directory still exists for that doc 4. If yes: re-run Gemini enrichment on just that window 5. If no text: mark as unrecoverable 6. Report summary Usage: python3 /opt/recon/scripts/repair_corrupted.py [--dry-run] [--workers 8] """ import json import time import random import logging import argparse import re import threading from pathlib import Path from concurrent.futures import ThreadPoolExecutor, as_completed from collections import defaultdict import google.generativeai as genai CORRUPTED_LIST = Path("/opt/recon/data/corrupted_windows.txt") TEXT_DIR = Path("/opt/recon/data/text") CONCEPTS_DIR = Path("/opt/recon/data/concepts") LOG_FILE = Path("/opt/recon/logs/repair_corrupted.log") UNRECOVERABLE_LOG = Path("/opt/recon/data/unrecoverable_windows.txt") logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s", handlers=[ logging.FileHandler(LOG_FILE), logging.StreamHandler(), ] ) log = logging.getLogger("repair_corrupted") CANONICAL_DOMAINS = [ "Defense & Tactics", "Sustainment Systems", "Off-Grid Systems", "Foundational Skills", "Communications", "Medical", "Food Systems", "Navigation", "Logistics", "Power Systems", "Leadership", "Scenario Playbooks", "Water Systems", "Security", "Community Coordination" ] ENRICH_PROMPT = """Extract knowledge concepts from this document text. A concept is a SELF-CONTAINED piece of knowledge that can stand alone. For each concept, provide ALL fields: Required: - content: Full text of the concept (complete procedure, definition, etc.) - summary: 1-2 sentence summary - title: Brief descriptive title - domain: Array of 1-5 from ONLY these exact strings (no others): Defense & Tactics, Sustainment Systems, Off-Grid Systems, Foundational Skills, Communications, Medical, Food Systems, Navigation, Logistics, Power Systems, Leadership, Scenario Playbooks, Water Systems, Security, Community Coordination CRITICAL: Do NOT use "Reference". Every concept belongs somewhere specific. - subdomain: Array of specific subcategories (up to 10) - keywords: Array of 3-30 searchable terms - skill_level: novice | intermediate | advanced - key_facts: Array of specific extractable claims, measurements, data points Optional (include when present): - scenario_applicable: Array from: tuesday_prepper, month_prepper, year_prepper, multi_year, eotwawki - cross_domain_tags: Array from: sustainment, medical, security, communications, leadership, logistics, navigation, power_systems, water_systems, food_systems, tactical_ops, community_coordination - chapter: Chapter name if identifiable - page_ref: Page reference Return JSON array. If no extractable concepts, return []. Document text: """ def load_gemini_keys(): env = Path("/opt/recon/.env") keys = [] for line in env.read_text().splitlines(): if line.startswith("GEMINI_KEY_"): keys.append(line.split("=", 1)[1].strip()) return keys class KeyRotator: def __init__(self, keys): self.keys = keys self._i = 0 self._lock = threading.Lock() def next(self): with self._lock: key = self.keys[self._i % len(self.keys)] self._i += 1 return key def repair_json_truncated(text): """Last-ditch attempt to salvage a truncated JSON array.""" text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text) text = re.sub(r',\s*([}\]])', r'\1', text) try: return json.loads(text) except Exception: pass # Find last complete object last_close = -1 depth = 0 in_str = False esc = False for i, ch in enumerate(text): if esc: esc = False; continue if ch == '\\' and in_str: esc = True; continue if ch == '"' and not esc: in_str = not in_str; continue if in_str: continue if ch == '{': depth += 1 elif ch == '}': depth -= 1 if depth == 0: last_close = i if last_close > 0: trimmed = text[:last_close + 1].rstrip().rstrip(',') open_brackets = trimmed.count('[') - trimmed.count(']') try: return json.loads(trimmed + ']' * open_brackets) except Exception: pass return None def enrich_window_text(text, key): """Call Gemini on raw window text, return concepts list.""" genai.configure(api_key=key) model = genai.GenerativeModel( "gemini-2.0-flash", generation_config={"response_mime_type": "application/json"} ) for attempt in range(4): try: resp = model.generate_content(ENRICH_PROMPT + text) raw = resp.text try: result = json.loads(raw) except Exception: result = repair_json_truncated(raw) if isinstance(result, list): return [c for c in result if isinstance(c, dict)] elif isinstance(result, dict): return [result] return [] except Exception as e: err = str(e).lower() if any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]): delay = min(5 * (2 ** attempt) + random.uniform(0, 3), 60) time.sleep(delay) else: log.warning(f" Non-transient error: {e}") break return None # failed def get_window_text(doc_hash, window_filename): """Reconstruct window text from page files.""" # Window filename: window_NNNN.json -> window index is NNNN try: w_idx = int(Path(window_filename).stem.split('_')[1]) - 1 except (IndexError, ValueError): return None text_path = TEXT_DIR / doc_hash if not text_path.exists(): return None page_files = sorted([ f for f in text_path.iterdir() if f.name.startswith('page_') and f.name.endswith('.txt') ]) if not page_files: return None # Re-derive which pages this window covered (window_size=5 from config) window_size = 5 start = w_idx * window_size window_pages = page_files[start:start + window_size] if not window_pages: return None parts = [] for j, pf in enumerate(window_pages): try: text = pf.read_text(encoding='utf-8') parts.append(f"--- Page {start + j + 1} ---\n{text}") except Exception: pass return "\n\n".join(parts) if parts else None def repair_file(corrupted_path, key_rotator, dry_run): """Attempt to repair a single corrupted window file.""" path = Path(corrupted_path) # Sanity check -- maybe it fixed itself somehow try: with open(path) as f: existing = json.load(f) return "already_valid" except Exception: pass # Extract doc hash and window name from path structure # Expected: /opt/recon/data/concepts/{hash}/window_NNNN.json doc_hash = path.parent.name window_filename = path.name # Get source text for this window window_text = get_window_text(doc_hash, window_filename) if not window_text: return "no_source_text" if dry_run: return "would_repair" # Re-enrich from source text key = key_rotator.next() concepts = enrich_window_text(window_text, key) if concepts is None: return "enrichment_failed" # Tag concepts with metadata try: w_idx = int(Path(window_filename).stem.split('_')[1]) - 1 window_size = 5 start_page = w_idx * window_size + 1 except Exception: w_idx = 0 start_page = 0 for c in concepts: c['_window'] = w_idx + 1 c['_start_page'] = start_page c['_doc_hash'] = doc_hash c['_repaired'] = True # Write repaired file try: with open(path, 'w', encoding='utf-8') as f: json.dump(concepts, f, indent=2, ensure_ascii=False) return "repaired" except Exception as e: return "write_error" def main(): parser = argparse.ArgumentParser() parser.add_argument("--dry-run", action="store_true") parser.add_argument("--workers", type=int, default=8) args = parser.parse_args() if not CORRUPTED_LIST.exists(): log.error(f"Corrupted list not found: {CORRUPTED_LIST}") log.error("Run Task 1 first to generate it.") return keys = load_gemini_keys() rotator = KeyRotator(keys) corrupted = [] with open(CORRUPTED_LIST) as f: for line in f: parts = line.strip().split('\t') if parts: corrupted.append(parts[0]) log.info(f"Repairing {len(corrupted):,} corrupted window files") log.info(f"Dry run: {args.dry_run} | Workers: {args.workers} | Keys: {len(keys)}") results = defaultdict(int) unrecoverable = [] lock = threading.Lock() with ThreadPoolExecutor(max_workers=args.workers) as ex: futures = {ex.submit(repair_file, p, rotator, args.dry_run): p for p in corrupted} done = 0 for future in as_completed(futures): path = futures[future] status = future.result() with lock: results[status] += 1 if status in ("no_source_text", "enrichment_failed", "write_error"): unrecoverable.append((path, status)) done += 1 if done % 100 == 0: log.info(f" {done:,}/{len(corrupted):,} | {dict(results)}") time.sleep(0.05) log.info("── Results ─────────────────────────────────────────────────") for status, count in sorted(results.items(), key=lambda x: -x[1]): log.info(f" {status:<25} {count:>8,}") if unrecoverable: with open(UNRECOVERABLE_LOG, 'w') as f: for path, reason in unrecoverable: f.write(f"{path}\t{reason}\n") log.info(f"\n Unrecoverable: {len(unrecoverable)} — logged to {UNRECOVERABLE_LOG}") else: log.info("\n All files repaired successfully.") if __name__ == "__main__": main()