Initial commit: RECON codebase baseline

Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-20 14:44:54 +02:00 · 2026-04-14 14:57:23 +00:00 · 2026-04-14 14:57:23 +00:00 · 563c16bb71
commit 563c16bb71
59 changed files with 18327 additions and 0 deletions
--- a/scripts/domain_reenrich.py
+++ b/scripts/domain_reenrich.py
@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""
+domain_reenrich.py — Re-enriches solo-Reference concepts that domain_remap.py
+couldn't fix via subdomain lookup. Reads remap_unknowns.jsonl, calls Gemini
+with a lightweight classification-only prompt, updates domain in-place.
+
+Usage:
+  python3 /opt/recon/scripts/domain_reenrich.py [--workers 16] [--limit N]
+
+Reads:  /opt/recon/data/remap_unknowns.jsonl
+Writes: domain field in-place in window JSON files
+Log:    /opt/recon/logs/domain_reenrich.log
+"""
+
+import json
+import time
+import random
+import logging
+import argparse
+import threading
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import defaultdict
+
+import google.generativeai as genai
+
+UNKNOWNS_FILE = Path("/opt/recon/data/remap_unknowns.jsonl")
+LOG_FILE = Path("/opt/recon/logs/domain_reenrich.log")
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+    handlers=[
+        logging.FileHandler(LOG_FILE),
+        logging.StreamHandler(),
+    ]
+)
+log = logging.getLogger("domain_reenrich")
+
+CANONICAL_DOMAINS = [
+    "Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
+    "Foundational Skills", "Communications", "Medical", "Food Systems",
+    "Navigation", "Logistics", "Power Systems", "Leadership",
+    "Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
+]
+
+DOMAIN_SET = set(CANONICAL_DOMAINS)
+
+CLASSIFY_PROMPT = """\
+Classify this knowledge concept into one or more domains.
+
+VALID DOMAINS (use ONLY these exact strings, no others):
+{domains}
+
+Concept title: {title}
+Concept tags: {subdomain}
+Concept preview: {content}
+
+Return ONLY valid JSON, no markdown, no explanation:
+{{"domain": ["Domain Name"]}}
+
+Rules:
+- Use only the domain strings listed above, spelled exactly
+- If genuinely multi-domain assign all that apply
+- Never return empty domain list — pick the closest match
+- Medical content, herbs, first aid, veterinary → Medical
+- Food growing, foraging, hunting, livestock → Sustainment Systems
+- Food preservation, canning, storage → Food Systems
+- Solar, wind, batteries, generators → Power Systems
+- Water sourcing, filtration, sanitation → Water Systems
+"""
+
+def load_gemini_keys():
+    env = Path("/opt/recon/.env")
+    keys = []
+    for line in env.read_text().splitlines():
+        if line.startswith("GEMINI_KEY_"):
+            keys.append(line.split("=", 1)[1].strip())
+    return keys
+
+class KeyRotator:
+    def __init__(self, keys):
+        self.keys = keys
+        self._i = 0
+        self._lock = threading.Lock()
+    def next(self):
+        with self._lock:
+            key = self.keys[self._i % len(self.keys)]
+            self._i += 1
+            return key
+
+def classify_concept(title, subdomains, content, key):
+    prompt = CLASSIFY_PROMPT.format(
+        domains="\n".join(f"  {d}" for d in CANONICAL_DOMAINS),
+        title=title or "(untitled)",
+        subdomain=", ".join(subdomains[:10]) if subdomains else "(none)",
+        content=content[:300] if content else "(none)",
+    )
+    genai.configure(api_key=key)
+    model = genai.GenerativeModel(
+        "gemini-2.0-flash",
+        generation_config={"response_mime_type": "application/json"}
+    )
+    for attempt in range(4):
+        try:
+            resp = model.generate_content(prompt)
+            data = json.loads(resp.text)
+            domains = [d for d in data.get("domain", []) if d in DOMAIN_SET]
+            if domains:
+                return domains
+        except Exception as e:
+            err = str(e).lower()
+            if any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
+                delay = min(5 * (2 ** attempt) + random.uniform(0, 3), 60)
+                time.sleep(delay)
+            else:
+                break
+    return ["Foundational Skills"]  # last-resort fallback
+
+def process_unknown(item, key_rotator):
+    filepath = Path(item["filepath"])
+    title = item.get("title", "")
+    subdomains = item.get("subdomain", [])
+    content = item.get("content_preview", "")
+
+    if not filepath.exists():
+        return "file_missing"
+
+    try:
+        with open(filepath, "r", encoding="utf-8") as f:
+            concepts = json.load(f)
+    except Exception:
+        return "read_error"
+
+    if not isinstance(concepts, list):
+        return "not_list"
+
+    # Find this concept by title and update its domain
+    matched = False
+    for concept in concepts:
+        if not isinstance(concept, dict):
+            continue
+        if concept.get("title", "") == title:
+            raw = concept.get("domain", [])
+            if isinstance(raw, str):
+                raw = [raw]
+            # Only re-enrich if still stuck on Reference
+            if raw == ["Reference"] or raw == []:
+                key = key_rotator.next()
+                new_domains = classify_concept(title, subdomains, content, key)
+                concept["domain"] = new_domains
+                concept["_reenriched"] = True
+                matched = True
+                break
+
+    if not matched:
+        return "already_fixed"
+
+    try:
+        with open(filepath, "w", encoding="utf-8") as f:
+            json.dump(concepts, f, indent=2, ensure_ascii=False)
+    except Exception:
+        return "write_error"
+
+    return "ok"
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--workers", type=int, default=16)
+    parser.add_argument("--limit", type=int, default=None)
+    args = parser.parse_args()
+
+    keys = load_gemini_keys()
+    if not keys:
+        log.error("No Gemini keys found in .env")
+        return
+    rotator = KeyRotator(keys)
+
+    unknowns = []
+    with open(UNKNOWNS_FILE, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                unknowns.append(json.loads(line))
+
+    if args.limit:
+        unknowns = unknowns[:args.limit]
+
+    total = len(unknowns)
+    log.info(f"Re-enriching {total:,} concepts | {args.workers} workers | {len(keys)} API keys")
+    log.info(f"Estimated Gemini Flash cost: ~${total * 0.0004:.2f} (conservative)")
+
+    results = defaultdict(int)
+    lock = threading.Lock()
+    done = 0
+
+    with ThreadPoolExecutor(max_workers=args.workers) as ex:
+        futures = {ex.submit(process_unknown, item, rotator): item for item in unknowns}
+        for future in as_completed(futures):
+            status = future.result()
+            with lock:
+                results[status] += 1
+                done += 1
+                if done % 5000 == 0:
+                    pct = done / total * 100
+                    log.info(f"  Progress: {done:,}/{total:,} ({pct:.1f}%) | {dict(results)}")
+            time.sleep(0.05)
+
+    log.info("── Final Results ─────────────────────────────────────────────")
+    for status, count in sorted(results.items(), key=lambda x: -x[1]):
+        log.info(f"  {status:<25} {count:>10,}")
+    log.info(f"  Total: {total:,}")
+
+if __name__ == "__main__":
+    main()