Initial commit: RECON codebase baseline

Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-20 06:34:40 +02:00 · 2026-04-14 14:57:23 +00:00 · 2026-04-14 14:57:23 +00:00 · 563c16bb71
commit 563c16bb71
59 changed files with 18327 additions and 0 deletions
--- a/scripts/init.py
+++ b/scripts/init.py
--- a/scripts/aa_download.py
+++ b/scripts/aa_download.py
@ -0,0 +1,373 @@
+#!/usr/bin/env python3
+"""
+aa_download.py — Anna's Archive bulk downloader for RECON library acquisition.
+
+For each target book:
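+  1. Searches Anna's Archive (the BASE_AA mirror, currently annas-archive.gl) for the title + author
+  2. Picks the best PDF match (first result whose link text contains the author's surname, else the first result)
+  3. Takes the MD5 from the result link and fetches the book page to record a page count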
+  4. Attempts download from Libgen mirrors in order
+  5. Verifies downloaded file is a valid PDF
+  6. Writes full acquisition report
+
+Usage:
+  python3 /opt/recon/scripts/aa_download.py [--dry-run] [--limit N]
+
+Report output: ~/projects/recon/aa_acquisition_report.md
+"""
+
+import json
+import time
+import random
+import hashlib
+import logging
+import argparse
+from pathlib import Path
+from datetime import datetime
+
+import requests
+from bs4 import BeautifulSoup
+
+REPORT_PATH = Path.home() / "projects/recon/aa_acquisition_report.md"
+LOG_FILE    = Path("/opt/recon/logs/aa_download.log")
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
+)
+log = logging.getLogger("aa_download")
+
+SESSION = requests.Session()
+SESSION.headers.update({
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
+    "Accept-Language": "en-US,en;q=0.9",
+})
+
+BASE_AA = "https://annas-archive.gl"
+
+# Download attempt order — try fastest mirrors first
+LIBGEN_MIRRORS = [
+    "https://libgen.is/get.php?md5={md5}",
+    "https://libgen.rs/get.php?md5={md5}",
+    "https://libgen.st/get.php?md5={md5}",
+    "https://libgen.li/ads.php?md5={md5}",
+]
+
+# ── Target book list ──────────────────────────────────────────────────────────
+# Each entry is a (title, author, dest_dir) tuple consumed by main()/process_book():
+#   title    — joined with author to form the search query; also the basis of the filename
+#   author   — its last whitespace-separated word is used to pick a candidate and in the filename
+#   dest_dir — sub-directory of BASE_LIB that the PDF is saved into
+TARGETS = [
+    # (title, author, dest_dir)
+
+    # Medical — Herbalism
+    ("Medical Herbalism",                          "David Hoffmann",             "Medical/Herbalism"),
+    ("Making Plant Medicine",                      "Richo Cech",                 "Medical/Herbalism"),
+    ("The Earthwise Herbal Volume 1",              "Matthew Wood",               "Medical/Herbalism"),
+    ("The Earthwise Herbal Volume 2",              "Matthew Wood",               "Medical/Herbalism"),
+    ("Herbal Antibiotics",                         "Stephen Buhner",             "Medical/Herbalism"),
+    ("Herbal Antivirals",                          "Stephen Buhner",             "Medical/Herbalism"),
+    ("The Herbal Medicine-Maker's Handbook",       "James Green",                "Medical/Herbalism"),
+    ("Rosemary Gladstar's Medicinal Herbs",        "Rosemary Gladstar",          "Medical/Herbalism"),
+
+    # Medical — Austere
+    ("Wilderness Medicine",                        "Paul Auerbach",              "Medical/Austere"),
+    ("Medicine for Mountaineering",                "James Wilkerson",            "Medical/Austere"),
+
+    # Medical — Veterinary
+    ("The Chicken Health Handbook",                "Gail Damerow",               "Medical/Veterinary"),
+    ("Goat Husbandry",                             "David Mackenzie",            "Medical/Veterinary"),
+
+    # Power Systems
+    ("The Renewable Energy Handbook",              "William Kemp",               "Power"),
+    ("Homebrew Wind Power",                        "Dan Bartmann",               "Power"),
+    ("Wind Energy Basics",                         "Paul Gipe",                  "Power"),
+    ("12-Volt Bible",                              "Brotherton",                 "Power"),
+    ("Wiring a House",                             "Rex Cauldwell",              "Power"),
+
+    # Navigation
+    ("Wilderness Navigation",                      "Bob Burns",                  "Navigation"),
+    ("Be Expert with Map and Compass",             "Bjorn Kjellstrom",           "Navigation"),
+    ("Emergency Navigation",                       "David Burch",                "Navigation"),
+    ("The Natural Navigator",                      "Tristan Gooley",             "Navigation"),
+    ("The Essential Wilderness Navigator",         "David Seidman",              "Navigation"),
+
+    # Water Systems
+    ("Rainwater Harvesting for Drylands Volume 1", "Brad Lancaster",            "Water"),
+    ("Rainwater Harvesting for Drylands Volume 2", "Brad Lancaster",            "Water"),
+    ("Rainwater Harvesting for Drylands Volume 3", "Brad Lancaster",            "Water"),
+    ("Water Storage",                              "Art Ludwig",                 "Water"),
+    ("The Home Water Supply",                      "Stu Campbell",               "Water"),
+
+    # Food Systems
+    ("The Art of Fermentation",                    "Sandor Katz",                "Food"),
+    ("Fermented Vegetables",                       "Kirsten Shockey",            "Food"),
+    ("Mastering Artisan Cheesemaking",             "Gianaclis Caldwell",         "Food"),
+    ("Home Cheese Making",                         "Ricki Carroll",              "Food"),
+    ("The Art of Natural Cheesemaking",            "David Asher",                "Food"),
+
+    # Permaculture
+    ("Edible Forest Gardens Volume 1",             "Dave Jacke",                 "Permaculture"),
+    ("Edible Forest Gardens Volume 2",             "Dave Jacke",                 "Permaculture"),
+    ("Creating a Forest Garden",                   "Martin Crawford",            "Permaculture"),
+    ("Sepp Holzer's Permaculture",                 "Sepp Holzer",                "Permaculture"),
+    ("The Permaculture Handbook",                  "Peter Bane",                 "Permaculture"),
+    ("The Market Gardener",                        "Jean-Martin Fortier",        "Permaculture"),
+
+    # Scenario / Emergency
+    ("SAS Survival Handbook",                      "John Wiseman",               "Scenario"),
+    ("Pocket Ref",                                 "Thomas Glover",              "Scenario"),
+    ("Deep Survival",                              "Laurence Gonzales",          "Scenario"),
+
+    # Foundational Skills
+    ("Back to Basics",                             "Reader's Digest",            "Skills"),
+    ("A Pattern Language",                         "Christopher Alexander",      "Skills"),
+]
+
+# Root of the acquired-books tree; each target's dest_dir is created beneath it.
+BASE_LIB = Path("/mnt/library/Acquired")
+
+
+def search_aa(title, author):
+    """Search Anna's Archive and return list of candidate result dicts."""
+    query = f"{title} {author}"
+    url = f"{BASE_AA}/search"
+    params = {"q": query, "ext": "pdf", "lang": "en"}
+    try:
+        r = SESSION.get(url, params=params, timeout=20)
+        r.raise_for_status()
+    except Exception as e:
+        log.warning(f"Search failed for '{title}': {e}")
+        return []
+
+    soup = BeautifulSoup(r.text, "html.parser")
+    results = []
+
+    seen_md5 = set()
+    for item in soup.select("a[href^='/md5/']"):
+        href = item.get("href", "")
+        md5 = href.split("/md5/")[-1].split("/")[0].split("?")[0].strip()
+        if not md5 or len(md5) != 32:
+            continue
+        text = item.get_text(" ", strip=True)
+        if not text or md5 in seen_md5:
+            continue
+        seen_md5.add(md5)
+        results.append({"md5": md5, "text": text, "href": href})
+        if len(results) >= 5:
+            break
+
+    return results
+
+
+def get_book_details(md5):
+    """Fetch the book detail page and extract useful metadata."""
+    url = f"{BASE_AA}/md5/{md5}"
+    try:
+        r = SESSION.get(url, timeout=20)
+        r.raise_for_status()
+        soup = BeautifulSoup(r.text, "html.parser")
+        text = soup.get_text(" ", strip=True)
+        # Extract page count if visible
+        pages = None
+        for word in text.split():
+            if word.isdigit() and 50 < int(word) < 5000:
+                pages = int(word)
+                break
+        return {"pages": pages, "text": text[:500]}
+    except Exception as e:
+        log.warning(f"Detail fetch failed for md5={md5}: {e}")
+        return {}
+
+
+def try_download(md5, dest_path):
+    """Try each libgen mirror until one works. Returns True on success."""
+    for mirror_tpl in LIBGEN_MIRRORS:
+        url = mirror_tpl.format(md5=md5)
+        try:
+            r = SESSION.get(url, timeout=60, stream=True, allow_redirects=True)
+            content_type = r.headers.get("content-type", "")
+            if r.status_code != 200:
+                continue
+            # Some mirrors return an HTML ads page before the real file
+            if "text/html" in content_type:
+                # Parse redirect link from ads page
+                soup = BeautifulSoup(r.text, "html.parser")
+                dl_link = soup.select_one("a[href*='.pdf']")
+                if not dl_link:
+                    dl_link = soup.select_one("a[href*='get.php']")
+                if not dl_link:
+                    continue
+                actual_url = dl_link["href"]
+                if not actual_url.startswith("http"):
+                    actual_url = f"https://libgen.is{actual_url}"
+                r = SESSION.get(actual_url, timeout=120, stream=True)
+                if r.status_code != 200:
+                    continue
+
+            # Stream to disk
+            dest_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(dest_path, "wb") as f:
+                for chunk in r.iter_content(8192):
+                    f.write(chunk)
+
+            # Verify it's a real PDF
+            with open(dest_path, "rb") as f:
+                header = f.read(4)
+            if header == b"%PDF":
+                size_mb = dest_path.stat().st_size / 1024 / 1024
+                log.info(f"  [OK] {dest_path.name} ({size_mb:.1f}MB) via {url}")
+                return True
+            else:
+                log.warning(f"  [BAD] Not a PDF from {url}")
+                dest_path.unlink(missing_ok=True)
+
+        except Exception as e:
+            log.warning(f"  Mirror failed {url}: {e}")
+            continue
+
+    return False
+
+
+def process_book(title, author, subdir, dry_run):
+    """Full search + download pipeline for one book."""
+    log.info(f"[SEARCH] '{title}' — {author}")
+    result = {
+        "title": title,
+        "author": author,
+        "status": "NOT FOUND",
+        "md5": "",
+        "pages": "",
+        "file": "",
+        "notes": "",
+    }
+
+    candidates = search_aa(title, author)
+    if not candidates:
+        result["notes"] = "No results from AA search"
+        return result
+
+    # Pick best candidate — prefer one whose text contains author name
+    best = None
+    for c in candidates:
+        if author.split()[-1].lower() in c["text"].lower():
+            best = c
+            break
+    if not best:
+        best = candidates[0]  # take first result if no author match
+
+    md5 = best["md5"]
+    result["md5"] = md5
+
+    details = get_book_details(md5)
+    result["pages"] = details.get("pages", "")
+
+    if dry_run:
+        result["status"] = "DRY RUN — found"
+        result["notes"] = f"MD5: {md5}"
+        return result
+
+    # Build destination path
+    safe_title = "".join(c if c.isalnum() or c in " ._-" else "_" for c in title)[:60]
+    safe_author = author.split()[-1]
+    filename = f"{safe_title}_{safe_author}.pdf"
+    dest = BASE_LIB / subdir / filename
+
+    if dest.exists():
+        result["status"] = "ALREADY EXISTS"
+        result["file"] = str(dest)
+        return result
+
+    log.info(f"  MD5: {md5} — attempting download...")
+    ok = try_download(md5, dest)
+
+    if ok:
+        result["status"] = "DOWNLOADED"
+        result["file"] = str(dest)
+    else:
+        result["status"] = "MD5 ONLY"
+        result["notes"] = f"All mirrors failed. MD5: {md5}"
+
+    return result
+
+
+def write_report(results):
+    REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
+    downloaded   = [r for r in results if r["status"] == "DOWNLOADED"]
+    md5_only     = [r for r in results if r["status"] == "MD5 ONLY"]
+    not_found    = [r for r in results if r["status"] == "NOT FOUND"]
+    already_have = [r for r in results if r["status"] == "ALREADY EXISTS"]
+
+    lines = [
+        f"# Anna's Archive Acquisition Report",
+        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
+        f"**Total searched:** {len(results)}",
+        f"",
+        f"| Status | Count |",
+        f"|--------|-------|",
+        f"| Downloaded | {len(downloaded)} |",
+        f"| MD5 only (mirrors failed) | {len(md5_only)} |",
+        f"| Not found on AA | {len(not_found)} |",
+        f"| Already in library | {len(already_have)} |",
+        f"",
+    ]
+
+    if downloaded:
+        lines += ["## Downloaded", ""]
+        lines += ["| Title | Author | Pages | File |", "|-------|--------|-------|------|"]
+        for r in downloaded:
+            lines.append(f"| {r['title']} | {r['author']} | {r['pages']} | `{Path(r['file']).name}` |")
+        lines.append("")
+
+    if md5_only:
+        lines += ["## Found on AA — Download Failed (use MD5 for manual retrieval)", ""]
+        lines += ["| Title | Author | MD5 | Notes |", "|-------|--------|-----|-------|"]
+        for r in md5_only:
+            lines.append(f"| {r['title']} | {r['author']} | `{r['md5']}` | {r['notes']} |")
+        lines.append("")
+
+    if not_found:
+        lines += ["## Not Found on Anna's Archive", ""]
+        lines += ["| Title | Author | Notes |", "|-------|--------|-------|"]
+        for r in not_found:
+            lines.append(f"| {r['title']} | {r['author']} | {r['notes']} |")
+        lines.append("")
+
+    if already_have:
+        lines += ["## Already in Library", ""]
+        lines += ["| Title | Author |", "|-------|--------|"]
+        for r in already_have:
+            lines.append(f"| {r['title']} | {r['author']} |")
+        lines.append("")
+
+    REPORT_PATH.write_text("\n".join(lines))
+    log.info(f"Report written to {REPORT_PATH}")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--limit", type=int, default=None)
+    args = parser.parse_args()
+
+    targets = TARGETS[:args.limit] if args.limit else TARGETS
+    log.info(f"Starting AA acquisition: {len(targets)} books | dry_run={args.dry_run}")
+
+    results = []
+    for i, (title, author, subdir) in enumerate(targets, 1):
+        log.info(f"[{i}/{len(targets)}]")
+        result = process_book(title, author, subdir, args.dry_run)
+        results.append(result)
+        log.info(f"  -> {result['status']}")
+        # Polite delay between requests
+        time.sleep(random.uniform(8, 15))
+
+    write_report(results)
+
+    print(f"\n-- Summary -----------------------------------------------")
+    for status in ["DOWNLOADED", "MD5 ONLY", "NOT FOUND", "ALREADY EXISTS", "DRY RUN — found"]:
+        count = sum(1 for r in results if r["status"] == status)
+        if count:
+            print(f"  {status:<35} {count:>3}")
+    print(f"  Report: {REPORT_PATH}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/aa_download_pass2.py
+++ b/scripts/aa_download_pass2.py
@ -0,0 +1,478 @@
+#!/usr/bin/env python3
+"""
+aa_download_pass2.py — Second-pass downloader for books that failed in pass 1.
+
+Takes the MD5s recorded in PASS1_FAILURES (falling back to the pass 1 report) and tries, in order:
+  1. Z-Library search by title/author (separate catalog from Libgen)
+  2. IPFS gateways using the IPFS CIDs listed on AA's book page (a CID is not the MD5)
+  3. Alternative Libgen mirrors not tried in pass 1
+  4. Direct AA slow download with longer timeout + retry (not implemented in this script yet)
+
+Checkpoint: saves progress to /opt/recon/data/aa_pass2_checkpoint.json
+  so interrupted runs resume where they left off.
+
+Usage:
+  python3 /opt/recon/scripts/aa_download_pass2.py [--dry-run]
+"""
+
+import json
+import time
+import random
+import logging
+import hashlib
+import argparse
+from pathlib import Path
+from datetime import datetime
+
+import requests
+from bs4 import BeautifulSoup
+
+LOG_FILE       = Path("/opt/recon/logs/aa_download_pass2.log")
+REPORT_IN      = Path.home() / "projects/recon/aa_acquisition_report.md"
+REPORT_OUT     = Path.home() / "projects/recon/aa_acquisition_report_pass2.md"
+CHECKPOINT     = Path("/opt/recon/data/aa_pass2_checkpoint.json")
+BASE_LIB       = Path("/mnt/library/Acquired")
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
+)
+log = logging.getLogger("aa_pass2")
+
+SESSION = requests.Session()
+SESSION.headers.update({
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
+    "Accept-Language": "en-US,en;q=0.9",
+})
+
+# ── Mirrors to try in order ───────────────────────────────────────────────────
+MIRRORS = [
+    # Libgen alternatives
+    "https://libgen.li/ads.php?md5={md5}",
+    "https://library.lol/main/{md5}",
+    "https://libgen.rocks/get.php?md5={md5}",
+    # Z-Library direct MD5 endpoint (sometimes works)
+    "https://z-library.se/md5/{md5}",
+    # IPFS public gateways — AA uses IPFS for storage
+    "https://cloudflare-ipfs.com/ipfs/{md5}",
+    "https://ipfs.io/ipfs/{md5}",
+    "https://gateway.pinata.cloud/ipfs/{md5}",
+]
+
+# ── Books that failed in pass 1 — title, author, md5, subdir ─────────────────
+# Each entry is (title, author, md5, subdir). main() uses the md5 given here
+# and only falls back to the pass 1 report when it is empty; subdir is the
+# directory below BASE_LIB that receives the PDF.
+PASS1_FAILURES = [
+    # Medical/Herbalism
+    # NOTE(review): Volume 1 and Volume 2 carry the same MD5, so at most one of
+    # them can be right — presumably pass 1 matched one search hit for both.
+    # Confirm against Anna's Archive before relying on either entry.
+    ("The Earthwise Herbal Volume 1",         "Matthew Wood",         "fc8dc19f5a17f38849a3979830dc95c1", "Medical/Herbalism"),
+    ("The Earthwise Herbal Volume 2",         "Matthew Wood",         "fc8dc19f5a17f38849a3979830dc95c1", "Medical/Herbalism"),
+    ("Herbal Antibiotics",                    "Stephen Buhner",       "5839dab78edfdff0d7986fac62b814da", "Medical/Herbalism"),
+    ("The Herbal Medicine-Maker's Handbook",  "James Green",          "27e8e8a3585705ed194029b69c7d61b1", "Medical/Herbalism"),
+    ("Rosemary Gladstar's Medicinal Herbs",   "Rosemary Gladstar",    "9b1966f20a32ab4331bfece167be1dd0", "Medical/Herbalism"),
+
+    # Medical/Austere
+    ("Wilderness Medicine",                   "Paul Auerbach",        "957818eaa4ec40527bb05902f9ef7c51", "Medical/Austere"),
+    ("Medicine for Mountaineering",           "James Wilkerson",      "39cb07998f2034206f0c9472e44cb0b4", "Medical/Austere"),
+
+    # Medical/Veterinary
+    ("The Chicken Health Handbook",           "Gail Damerow",         "0ba42fbea034b9a08ec8e2f8d7606efe", "Medical/Veterinary"),
+
+    # Power
+    ("The Renewable Energy Handbook",         "William Kemp",         "475d89fa80aea6c45aa4b1b4b9c5e274", "Power"),
+    ("Homebrew Wind Power",                   "Dan Bartmann",         "0578696d5b1b6bceb3e5e3302c1a31aa", "Power"),
+    ("Wind Energy Basics",                    "Paul Gipe",            "ccbe9d22e0a5e32d61921d20d66a8e05", "Power"),
+    ("12-Volt Bible",                         "Brotherton",           "3f964fa6d730fdf2c3d3e231e87cf692", "Power"),
+    ("Wiring a House",                        "Rex Cauldwell",        "5efcb53450e9eb560210eee40678adcf", "Power"),
+
+    # Navigation
+    ("Emergency Navigation",                  "David Burch",          "25e4def9e777b3fa9ca935134732ff9d", "Navigation"),
+
+    # Water
+    ("Water Storage",                         "Art Ludwig",           "17c965ec15c6cf4f09b5377b599a5266", "Water"),
+    ("The Home Water Supply",                 "Stu Campbell",         "9b22677d2f8e8b39f7a6bf032187295b", "Water"),
+
+    # Food
+    ("Fermented Vegetables",                  "Kirsten Shockey",      "74d3bde876b4c17be66c21fdfa85213e", "Food"),
+    ("The Art of Natural Cheesemaking",       "David Asher",          "bc0e0829d701fea9beca912d39f8cc74", "Food"),
+
+    # Permaculture
+    ("Edible Forest Gardens Volume 1",        "Dave Jacke",           "6b069c3bb077fdd89d487a363c070fbb", "Permaculture"),
+    ("Edible Forest Gardens Volume 2",        "Dave Jacke",           "699255bfde7f69285c132a94ec291bf4", "Permaculture"),
+    ("Creating a Forest Garden",              "Martin Crawford",      "96d71d70dba31ae86e14845f913e557e", "Permaculture"),
+    ("Sepp Holzer's Permaculture",            "Sepp Holzer",          "32be55a9fce3e31cacd6912069abb410", "Permaculture"),
+    ("The Permaculture Handbook",             "Peter Bane",           "08cb4492739fda4d01b5a868a408e4a0", "Permaculture"),
+    ("The Market Gardener",                   "Jean-Martin Fortier",  "ac69f6c8c22305b42b539482dc761c19", "Permaculture"),
+
+    # Scenario
+    ("SAS Survival Handbook",                 "John Wiseman",         "fa967fd5fcbeb3c9887e22f73e590c64", "Scenario"),
+    ("Pocket Ref",                            "Thomas Glover",        "8e4988ce513a4aa75e7e6c00ee36692b", "Scenario"),
+    ("Deep Survival",                         "Laurence Gonzales",    "9a907ab13b81ea597407fffdb8ea1b04", "Scenario"),
+
+    # Skills
+    ("A Pattern Language",                    "Christopher Alexander","7f5cc06b5399b65a278c4005ccd8d476", "Skills"),
+]
+
+
+def load_checkpoint():
+    """Load checkpoint: dict of {title: result_dict} for completed books."""
+    if CHECKPOINT.exists():
+        try:
+            return json.loads(CHECKPOINT.read_text())
+        except Exception:
+            pass
+    return {}
+
+
+def save_checkpoint(completed):
+    """Save checkpoint after each book."""
+    CHECKPOINT.parent.mkdir(parents=True, exist_ok=True)
+    tmp = str(CHECKPOINT) + ".tmp"
+    with open(tmp, "w") as f:
+        json.dump(completed, f, indent=2)
+    Path(tmp).replace(CHECKPOINT)
+
+
+def load_md5s_from_report():
+    """Parse MD5 hashes from pass 1 report to pre-populate PASS1_FAILURES."""
+    if not REPORT_IN.exists():
+        return {}
+    md5_map = {}
+    for line in REPORT_IN.read_text().splitlines():
+        if "`" in line and len(line) > 30:
+            parts = line.split("|")
+            if len(parts) >= 4:
+                title = parts[1].strip()
+                md5_cell = parts[3].strip().strip("`")
+                if len(md5_cell) == 32 and md5_cell.isalnum():
+                    md5_map[title.lower()] = md5_cell
+    return md5_map
+
+
+def search_zlib(title, author):
+    """Try Z-Library search endpoint."""
+    try:
+        url = "https://z-library.se/s/"
+        params = {"q": f"{title} {author}", "extension[]": "pdf"}
+        r = SESSION.get(url, params=params, timeout=15)
+        if r.status_code != 200:
+            return None
+        soup = BeautifulSoup(r.text, "html.parser")
+        # Z-lib book links contain /book/
+        for a in soup.select("a[href*='/book/']")[:3]:
+            href = a.get("href", "")
+            if href:
+                book_url = f"https://z-library.se{href}" if href.startswith("/") else href
+                return book_url
+    except Exception as e:
+        log.debug(f"Zlib search failed: {e}")
+    return None
+
+
+def try_zlib_download(book_url, dest_path):
+    """Download from Z-Library book page."""
+    try:
+        r = SESSION.get(book_url, timeout=15)
+        soup = BeautifulSoup(r.text, "html.parser")
+        dl = soup.select_one("a.addDownloadedBook, a[href*='/dl/'], a.btn-primary[href*='download']")
+        if not dl:
+            return False
+        dl_url = dl["href"]
+        if not dl_url.startswith("http"):
+            dl_url = f"https://z-library.se{dl_url}"
+        r2 = SESSION.get(dl_url, timeout=120, stream=True)
+        if r2.status_code != 200:
+            return False
+        dest_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(dest_path, "wb") as f:
+            for chunk in r2.iter_content(8192):
+                f.write(chunk)
+        with open(dest_path, "rb") as f:
+            if f.read(4) == b"%PDF":
+                return True
+        dest_path.unlink(missing_ok=True)
+    except Exception as e:
+        log.debug(f"Zlib download failed: {e}")
+    return False
+
+
+def try_mirrors(md5, dest_path):
+    """Try all mirrors with the MD5."""
+    import re as _re
+    for tpl in MIRRORS:
+        url = tpl.format(md5=md5)
+        try:
+            r = SESSION.get(url, timeout=20, stream=True, allow_redirects=True)
+            if r.status_code != 200:
+                continue
+            ctype = r.headers.get("content-type", "")
+            if "html" in ctype:
+                soup = BeautifulSoup(r.text, "html.parser")
+                # For libgen.li ads page, look for get.php with key
+                dl = None
+                match = _re.search(r'href="(get\.php\?md5=[^"]+)"', r.text)
+                if match:
+                    actual = f"https://libgen.li/{match.group(1)}"
+                else:
+                    dl = (soup.select_one("a[href*='.pdf']") or
+                          soup.select_one("a[href*='get.php']") or
+                          soup.select_one("a[href*='/get/']"))
+                    if not dl:
+                        continue
+                    actual = dl["href"]
+                    if not actual.startswith("http"):
+                        base = url.split("/")[0] + "//" + url.split("/")[2]
+                        actual = base + ("/" if not actual.startswith("/") else "") + actual
+
+                r = SESSION.get(actual, timeout=60, stream=True)
+                if r.status_code != 200:
+                    continue
+
+            dest_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(dest_path, "wb") as f:
+                for chunk in r.iter_content(8192):
+                    f.write(chunk)
+            with open(dest_path, "rb") as f:
+                if f.read(4) == b"%PDF":
+                    size_mb = dest_path.stat().st_size / 1024 / 1024
+                    log.info(f"    [OK] {size_mb:.1f}MB via {url}")
+                    return True
+            dest_path.unlink(missing_ok=True)
+        except Exception as e:
+            log.debug(f"Mirror {url} failed: {e}")
+        time.sleep(2)
+    return False
+
+
+def get_ipfs_cids(md5):
+    """Fetch IPFS CIDs from AA book detail page."""
+    import re as _re
+    cids = []
+    try:
+        r = SESSION.get(f"https://annas-archive.gl/md5/{md5}", timeout=20)
+        if r.status_code == 200:
+            for m in _re.finditer(r'ipfs_cid[:\s]+([A-Za-z0-9]{46,})', r.text):
+                cids.append(m.group(1))
+            # Also check for CIDs in href attributes
+            for m in _re.finditer(r'ipfs://([A-Za-z0-9]{46,})', r.text):
+                if m.group(1) not in cids:
+                    cids.append(m.group(1))
+    except Exception as e:
+        log.debug(f"IPFS CID fetch failed: {e}")
+    return cids
+
+
+def try_ipfs_download(cids, dest_path):
+    """Try downloading via IPFS public gateways."""
+    gateways = [
+        "https://cloudflare-ipfs.com/ipfs/{}",
+        "https://dweb.link/ipfs/{}",
+    ]
+    for cid in cids[:3]:  # limit to first 3 CIDs
+        for gw_tpl in gateways:
+            url = gw_tpl.format(cid)
+            try:
+                r = SESSION.get(url, timeout=15, stream=True)
+                if r.status_code != 200:
+                    continue
+                dest_path.parent.mkdir(parents=True, exist_ok=True)
+                with open(dest_path, "wb") as f:
+                    for chunk in r.iter_content(8192):
+                        f.write(chunk)
+                with open(dest_path, "rb") as f:
+                    if f.read(4) == b"%PDF":
+                        size_mb = dest_path.stat().st_size / 1024 / 1024
+                        log.info(f"    [OK] {size_mb:.1f}MB via IPFS {url[:60]}...")
+                        return True
+                dest_path.unlink(missing_ok=True)
+            except Exception as e:
+                log.debug(f"IPFS {url} failed: {e}")
+            time.sleep(1)
+    return False
+
+
+def search_aa_fresh(title, author):
+    """Fresh AA search on .gl domain for books that weren't found before."""
+    for domain in ["annas-archive.gl", "annas-archive.se", "annas-archive.org"]:
+        try:
+            url = f"https://{domain}/search"
+            params = {"q": f"{title} {author}", "ext": "pdf", "lang": "en"}
+            r = SESSION.get(url, params=params, timeout=15)
+            if r.status_code != 200:
+                continue
+            soup = BeautifulSoup(r.text, "html.parser")
+            for a in soup.select("a[href^='/md5/']"):
+                text = a.get_text(" ", strip=True)
+                if not text:
+                    continue
+                md5 = a["href"].split("/md5/")[-1].split("/")[0].strip()
+                if len(md5) == 32:
+                    if author.split()[-1].lower() in text.lower() or title.split()[0].lower() in text.lower():
+                        return md5
+        except Exception:
+            continue
+    return None
+
+
+def process_book(title, author, md5_hint, subdir, dry_run):
+    result = {
+        "title": title, "author": author,
+        "status": "NOT FOUND", "md5": md5_hint,
+        "file": "", "notes": "",
+    }
+
+    safe_title  = "".join(c if c.isalnum() or c in " ._-" else "_" for c in title)[:60]
+    safe_author = author.split()[-1]
+    dest = BASE_LIB / subdir / f"{safe_title}_{safe_author}.pdf"
+
+    if dest.exists():
+        result["status"] = "ALREADY EXISTS"
+        result["file"] = str(dest)
+        return result
+
+    if dry_run:
+        result["status"] = "DRY RUN"
+        return result
+
+    # 1. Try Z-Library first (different catalog)
+    log.info(f"  Trying Z-Library...")
+    zlib_url = search_zlib(title, author)
+    if zlib_url:
+        if try_zlib_download(zlib_url, dest):
+            result["status"] = "DOWNLOADED (Z-Library)"
+            result["file"] = str(dest)
+            return result
+
+    # 2. If no MD5 from pass 1, do a fresh AA search
+    md5 = md5_hint
+    if not md5:
+        log.info(f"  Searching AA for fresh MD5...")
+        md5 = search_aa_fresh(title, author)
+        if md5:
+            result["md5"] = md5
+            log.info(f"  Found MD5: {md5}")
+
+    # 3. Try IPFS with real CIDs from AA detail page
+    if md5:
+        log.info(f"  Fetching IPFS CIDs from AA...")
+        cids = get_ipfs_cids(md5)
+        if cids:
+            log.info(f"  Found {len(cids)} IPFS CID(s), trying gateways...")
+            if try_ipfs_download(cids, dest):
+                result["status"] = "DOWNLOADED (IPFS)"
+                result["file"] = str(dest)
+                return result
+
+    # 4. Try all mirrors with MD5
+    if md5:
+        log.info(f"  Trying mirrors with MD5 {md5}...")
+        if try_mirrors(md5, dest):
+            result["status"] = "DOWNLOADED (mirror)"
+            result["file"] = str(dest)
+            return result
+        result["status"] = "MD5 ONLY"
+        result["notes"] = f"MD5 confirmed, all mirrors failed: {md5}"
+    else:
+        result["notes"] = "Not found on AA or Z-Library"
+
+    return result
+
+
+def write_report(results):
+    downloaded = [r for r in results if "DOWNLOADED" in r["status"]]
+    md5_only   = [r for r in results if r["status"] == "MD5 ONLY"]
+    not_found  = [r for r in results if r["status"] == "NOT FOUND"]
+    existing   = [r for r in results if r["status"] == "ALREADY EXISTS"]
+
+    lines = [
+        "# AA Acquisition Report -- Pass 2",
+        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
+        f"**Searched:** {len(results)} | **Downloaded:** {len(downloaded)} | "
+        f"**MD5 only:** {len(md5_only)} | **Not found:** {len(not_found)}",
+        "",
+    ]
+    if downloaded:
+        lines += ["## Downloaded", "",
+                  "| Title | Author | Via | File |",
+                  "|-------|--------|-----|------|"]
+        for r in downloaded:
+            lines.append(f"| {r['title']} | {r['author']} | {r['status']} | `{Path(r['file']).name}` |")
+        lines.append("")
+
+    if existing:
+        lines += ["## Already in Library", "",
+                  "| Title | Author |",
+                  "|-------|--------|"]
+        for r in existing:
+            lines.append(f"| {r['title']} | {r['author']} |")
+        lines.append("")
+
+    if md5_only:
+        lines += ["## MD5 Known -- All Mirrors Failed", "",
+                  "| Title | Author | MD5 |",
+                  "|-------|--------|-----|"]
+        for r in md5_only:
+            lines.append(f"| {r['title']} | {r['author']} | `{r['md5']}` |")
+        lines.append("")
+
+    if not_found:
+        lines += ["## Not Found Anywhere", "",
+                  "| Title | Author | Notes |",
+                  "|-------|--------|-------|"]
+        for r in not_found:
+            lines.append(f"| {r['title']} | {r['author']} | {r['notes']} |")
+        lines.append("")
+
+    REPORT_OUT.parent.mkdir(parents=True, exist_ok=True)
+    REPORT_OUT.write_text("\n".join(lines))
+    log.info(f"Report written to {REPORT_OUT}")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true")
+    args = parser.parse_args()
+
+    # Load any MD5s captured in pass 1
+    md5_map = load_md5s_from_report()
+    targets = []
+    for title, author, md5_hint, subdir in PASS1_FAILURES:
+        md5 = md5_hint or md5_map.get(title.lower(), "")
+        targets.append((title, author, md5, subdir))
+
+    # Load checkpoint
+    completed = load_checkpoint()
+    if completed:
+        log.info(f"Resuming: {len(completed)} books already processed in previous run")
+
+    log.info(f"Pass 2: {len(targets)} books | dry_run={args.dry_run}")
+    results = []
+    for i, (title, author, md5, subdir) in enumerate(targets, 1):
+        # Check checkpoint — skip already-processed books
+        if title in completed and not args.dry_run:
+            result = completed[title]
+            results.append(result)
+            log.info(f"[{i}/{len(targets)}] {title} — SKIPPED (checkpoint: {result['status']})")
+            continue
+
+        log.info(f"[{i}/{len(targets)}] {title} -- {author}")
+        result = process_book(title, author, md5, subdir, args.dry_run)
+        results.append(result)
+        log.info(f"  -> {result['status']}")
+
+        # Save checkpoint after each book (not in dry-run)
+        if not args.dry_run:
+            completed[title] = result
+            save_checkpoint(completed)
+
+        time.sleep(random.uniform(6, 12))
+
+    write_report(results)
+    print(f"\n-- Pass 2 Summary ----------------------------------------")
+    for status in ["DOWNLOADED (Z-Library)", "DOWNLOADED (IPFS)", "DOWNLOADED (mirror)", "MD5 ONLY", "NOT FOUND", "ALREADY EXISTS", "DRY RUN"]:
+        count = sum(1 for r in results if r["status"] == status)
+        if count:
+            print(f"  {status:<35} {count:>3}")
+    print(f"  Report: {REPORT_OUT}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/backup.sh
+++ b/scripts/backup.sh
@ -0,0 +1,64 @@
+#!/bin/bash
+# RECON Backup Script
+# Backs up the precious data: concept JSONs, text extracts, SQLite DB
+# Qdrant is NOT backed up — rebuilt from JSONs via `recon rebuild`
+# Destination: Contabo VPS (100.64.0.1) via rsync+SSH
+
+set -euo pipefail
+
+RECON_DIR="/opt/recon"
+DATA_DIR="$RECON_DIR/data"
+LOG_FILE="$RECON_DIR/logs/backup.log"
+DATE=$(date +%Y%m%d_%H%M%S)
+
+BACKUP_HOST="root@100.64.0.1"
+BACKUP_BASE="/opt/backups/recon"
+
+# Local staging file for the SQLite snapshot. The EXIT trap removes it even
+# when a later step fails and `set -e` aborts the script.
+LOCAL_DB_BACKUP="/tmp/recon_${DATE}.db"
+trap 'rm -f "$LOCAL_DB_BACKUP"' EXIT
+
+# Append a timestamped message to the log file and echo it to stdout.
+log() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
+}
+
+# Succeed only if directory $1 exists and contains at least one entry.
+dir_has_content() {
+    [ -d "$1" ] && [ -n "$(find "$1" -mindepth 1 -print -quit 2>/dev/null)" ]
+}
+
+# Mirror local directory $1 to $BACKUP_BASE/$2/ on the backup host.
+# Aborts instead of syncing when the source is missing or empty: with
+# --delete, an empty source (e.g. an unmounted data volume) would erase the
+# remote copy.
+mirror_dir() {
+    local src="$1" name="$2"
+    if ! dir_has_content "$src"; then
+        log "  ERROR: $src is missing or empty — refusing to run rsync --delete"
+        exit 1
+    fi
+    rsync -az --delete "$src/" "$BACKUP_HOST:$BACKUP_BASE/$name/"
+}
+
+mkdir -p "$RECON_DIR/logs"
+
+log "=== RECON Backup Starting ==="
+
+# ── 1. SQLite DB (small, fast, critical) ──
+# `.backup` takes a consistent snapshot even if the DB is in use.
+log "Backing up recon.db..."
+sqlite3 "$DATA_DIR/recon.db" ".backup '$LOCAL_DB_BACKUP'"
+rsync -az "$LOCAL_DB_BACKUP" "$BACKUP_HOST:$BACKUP_BASE/recon_${DATE}.db"
+rm -f "$LOCAL_DB_BACKUP"
+# Keep the 7 newest DB backups on remote (tail -n +8 selects the rest)
+ssh "$BACKUP_HOST" "ls -t $BACKUP_BASE/recon_*.db 2>/dev/null | tail -n +8 | xargs rm -f 2>/dev/null || true"
+log "  recon.db backed up"
+
+# ── 2. Concept JSONs (THE PRECIOUS DATA — $130+ of Gemini work) ──
+log "Syncing concept JSONs..."
+mirror_dir "$DATA_DIR/concepts" "concepts"
+CONCEPT_COUNT=$(find "$DATA_DIR/concepts/" -name "*.json" 2>/dev/null | wc -l)
+log "  concepts synced ($CONCEPT_COUNT JSON files)"
+
+# ── 3. Text extracts (regenerable but expensive in time) ──
+log "Syncing text extracts..."
+mirror_dir "$DATA_DIR/text" "text"
+# -mindepth 1 excludes the text/ directory itself from the count
+TEXT_COUNT=$(find "$DATA_DIR/text/" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l)
+log "  text synced ($TEXT_COUNT document dirs)"
+
+# ── 4. Intel feeds (optional: skipped when the directory does not exist) ──
+if [ -d "$DATA_DIR/intel" ]; then
+    log "Syncing intel feeds..."
+    rsync -az --delete "$DATA_DIR/intel/" "$BACKUP_HOST:$BACKUP_BASE/intel/"
+    log "  intel synced"
+fi
+
+# ── 5. Config files ──
+# .env is best-effort (it may not exist); the 3 newest copies of each are kept.
+log "Backing up config..."
+rsync -az "$RECON_DIR/config.yaml" "$BACKUP_HOST:$BACKUP_BASE/config_${DATE}.yaml"
+rsync -az "$RECON_DIR/.env" "$BACKUP_HOST:$BACKUP_BASE/env_${DATE}" 2>/dev/null || true
+ssh "$BACKUP_HOST" "ls -t $BACKUP_BASE/config_*.yaml 2>/dev/null | tail -n +4 | xargs rm -f 2>/dev/null || true"
+ssh "$BACKUP_HOST" "ls -t $BACKUP_BASE/env_* 2>/dev/null | tail -n +4 | xargs rm -f 2>/dev/null || true"
+log "  config backed up"
+
+# ── Summary ──
+BACKUP_SIZE=$(ssh "$BACKUP_HOST" "du -sh $BACKUP_BASE" | cut -f1)
+log "=== Backup Complete: $BACKUP_SIZE on Contabo ==="
--- a/scripts/cleanup_outliers.py
+++ b/scripts/cleanup_outliers.py
@ -0,0 +1,449 @@
+#!/usr/bin/env python3
+"""
+cleanup_outliers.py — Three-pass cleanup of RECON concept data.
+
+Pass 1: Remap ~160 non-canonical domain strings in concept JSONs + Qdrant payloads
+Pass 2: Re-enrich 434 concepts with empty domain arrays via Gemini
+Pass 3: Purge junk/noise URLs from Qdrant + SQLite DB
+
+Usage:
+  python3 /opt/recon/scripts/cleanup_outliers.py [--dry-run] [--skip-pass N]
+"""
+
+import json
+import time
+import random
+import logging
+import argparse
+import threading
+import sqlite3
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import defaultdict
+
+import google.generativeai as genai
+from qdrant_client import QdrantClient
+from qdrant_client.models import FieldCondition, MatchAny, Filter
+
+import sys, os
+sys.path.insert(0, '/opt/recon')
+from lib.utils import get_config, setup_logging
+
+# Log to both a file and the console. logging.FileHandler opens LOG_FILE as
+# soon as it is constructed, so /opt/recon/logs must exist before this
+# module is imported.
+LOG_FILE = Path("/opt/recon/logs/cleanup_outliers.log")
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
+)
+log = logging.getLogger("cleanup_outliers")
+
+# Root directory searched (recursively in pass 1) for window_*.json files.
+CONCEPTS_DIR = Path("/opt/recon/data/concepts")
+# SQLite database whose `documents` table is updated in pass 3.
+DB_PATH = Path("/opt/recon/data/recon.db")
+
+# The only domain names considered valid; anything else is remapped through
+# OUTLIER_MAP or dropped.
+CANONICAL_DOMAINS = {
+    "Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
+    "Foundational Skills", "Communications", "Medical", "Food Systems",
+    "Navigation", "Logistics", "Power Systems", "Leadership",
+    "Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
+}
+
+# Non-canonical → canonical remap.
+# Keys are matched exactly (case-sensitive); every value is a member of
+# CANONICAL_DOMAINS. The keys also form the Qdrant filter used in pass 1.
+OUTLIER_MAP = {
+    "Zoology":                  "Sustainment Systems",
+    "Botany":                   "Sustainment Systems",
+    "Nature Lore":              "Sustainment Systems",
+    "Ecology":                  "Sustainment Systems",
+    "Navigational Astronomy":   "Navigation",
+    "Troubleshooting":          "Foundational Skills",
+    "Chemistry":                "Foundational Skills",
+    "Metallurgy":               "Foundational Skills",
+    "Weird Science":            "Foundational Skills",
+    "Philosophy of physics":    "Foundational Skills",
+    "Physics":                  "Foundational Skills",
+    "Cell biology":             "Foundational Skills",
+    "Economics":                "Leadership",
+    "Business":                 "Leadership",
+    "Safety":                   "Security",
+    "Law Enforcement":          "Security",
+    "Security & Intelligence":  "Security",
+    "Fire Weather":             "Scenario Playbooks",
+    "Legal":                    "Leadership",
+    # Labels with no meaningful counterpart — not deleted, but folded into
+    # the catch-all "Foundational Skills" domain
+    "Site News":                "Foundational Skills",
+    "Paleogeography":           "Foundational Skills",
+    "Chemical Manipulation":    "Foundational Skills",
+}
+
+# Junk URL patterns — pages with no knowledge value.
+# is_junk_url() tests each entry as a case-insensitive *substring* of the
+# point's `filename` payload, so an entry without a trailing slash also
+# matches any longer path that starts with the same text.
+JUNK_URL_PATTERNS = [
+    # rocketstoves.com nav/template garbage
+    "rocketstoves.com/favicon",
+    "rocketstoves.com/cropped-favicon",
+    "rocketstoves.com/layouts/",
+    "rocketstoves.com/sample",
+    "rocketstoves.com/templates/",
+    "rocketstoves.com/hello-world",
+    "rocketstoves.com/blog-forthcoming",
+    "rocketstoves.com/contact",
+    "rocketstoves.com/acknowledgements",
+    "rocketstoves.com/ja3",
+    "rocketstoves.com/juxtapositions",
+    "rocketstoves.com/no-name-soi",
+    "rocketstoves.com/big4",
+    "rocketstoves.com/roof",
+    "rocketstoves.com/rmh_dloadcover",
+    "rocketstoves.com/pedcover",
+    "rocketstoves.com/laundry-to-landscape",
+    "rocketstoves.com/barreloven",
+    # NRCS calendar/event noise
+    "nrcs.usda.gov/events/",
+    "nrcs.usda.gov/state-offices/massachusetts",
+    "nrcs.usda.gov/state-offices/nebraska",
+    "nrcs.usda.gov/state-offices/oklahoma",
+    "nrcs.usda.gov/state-offices/utah",
+    "nrcs.usda.gov/conservation-basics/natural-resource-concerns/soil/western-call-for-abstracts",
+    # deeranddeerhunting trophy hunt videos (no knowledge value)
+    "deeranddeerhunting.com/trophy-whitetails-exclusive-videos/",
+    # eattheweeds non-content pages
+    "eattheweeds.com/media-interviews-with-green-deane",
+    "eattheweeds.com/motorcycles-and-mushrooms",
+    "eattheweeds.com/sunny-savage",
+    # foragersharvest nav pages
+    "foragersharvest.com/contact",
+    "foragersharvest.com/podcasts",
+    # motherearthnews classifieds/nav
+    "motherearthnews.com/classifieds/",
+    "motherearthnews.com/biographies/",
+]
+
+# Prompt template for classify_concept(). Filled with str.format(), which
+# substitutes {title}, {subdomain} and {content}; the doubled braces around
+# the JSON example are format() escapes for literal { and }.
+# The domain list in the text duplicates CANONICAL_DOMAINS and must be kept
+# in sync with it by hand.
+CLASSIFY_PROMPT = """\
+Classify this knowledge concept into one or more domains.
+
+VALID DOMAINS (use ONLY these exact strings):
+  Defense & Tactics, Sustainment Systems, Off-Grid Systems, Foundational Skills,
+  Communications, Medical, Food Systems, Navigation, Logistics, Power Systems,
+  Leadership, Scenario Playbooks, Water Systems, Security, Community Coordination
+
+Concept title: {title}
+Concept tags: {subdomain}
+Concept preview: {content}
+
+Return ONLY valid JSON, no markdown:
+{{"domain": ["Domain Name"]}}
+
+Rules:
+- Never return empty domain list
+- Medical content, herbs, first aid, veterinary → Medical
+- Food growing, foraging, hunting, livestock → Sustainment Systems
+- Food preservation, canning, storage → Food Systems
+- Solar, wind, batteries, generators → Power Systems
+- Water sourcing, filtration, sanitation → Water Systems
+"""
+
+def load_gemini_keys():
+    keys = []
+    for line in Path("/opt/recon/.env").read_text().splitlines():
+        if line.startswith("GEMINI_KEY_"):
+            keys.append(line.split("=", 1)[1].strip())
+    return keys
+
+class KeyRotator:
+    def __init__(self, keys):
+        self.keys = keys
+        self._i = 0
+        self._lock = threading.Lock()
+    def next(self):
+        with self._lock:
+            key = self.keys[self._i % len(self.keys)]
+            self._i += 1
+            return key
+
+def classify_concept(title, subdomains, content, key):
+    prompt = CLASSIFY_PROMPT.format(
+        title=title or "(untitled)",
+        subdomain=", ".join(subdomains[:10]) if subdomains else "(none)",
+        content=str(content)[:300] if content else "(none)",
+    )
+    genai.configure(api_key=key)
+    model = genai.GenerativeModel(
+        "gemini-2.0-flash",
+        generation_config={"response_mime_type": "application/json"}
+    )
+    for attempt in range(4):
+        try:
+            resp = model.generate_content(prompt)
+            data = json.loads(resp.text)
+            domains = [d for d in data.get("domain", []) if d in CANONICAL_DOMAINS]
+            if domains:
+                return domains
+        except Exception as e:
+            err = str(e).lower()
+            if any(s in err for s in ["429", "quota", "rate", "503"]):
+                time.sleep(min(5 * (2 ** attempt) + random.uniform(0, 3), 60))
+            else:
+                break
+    return ["Foundational Skills"]
+
+# ── PASS 1: Remap outlier domains ────────────────────────────────────────────
+
+def remap_concept_domains(domains):
+    """Remap any outlier domain names in a domain list."""
+    result = set()
+    changed = False
+    for d in domains:
+        if d in CANONICAL_DOMAINS:
+            result.add(d)
+        elif d in OUTLIER_MAP:
+            result.add(OUTLIER_MAP[d])
+            changed = True
+        else:
+            changed = True  # drop unknown
+    return list(result), changed
+
+def pass1_remap_outliers(qdrant, collection, dry_run):
+    log.info("=== PASS 1: Remapping non-canonical outlier domains ===")
+    outlier_names = list(OUTLIER_MAP.keys())
+    stats = defaultdict(int)
+
+    # Scroll through Qdrant finding affected vectors
+    offset = None
+    affected_points = []
+
+    while True:
+        results, offset = qdrant.scroll(
+            collection_name=collection,
+            scroll_filter=Filter(
+                must=[FieldCondition(
+                    key="domain",
+                    match=MatchAny(any=outlier_names)
+                )]
+            ),
+            limit=500,
+            with_payload=True,
+            with_vectors=False,
+            offset=offset,
+        )
+        affected_points.extend(results)
+        if offset is None:
+            break
+
+    log.info(f"Found {len(affected_points)} Qdrant points with outlier domains")
+
+    for point in affected_points:
+        payload = point.payload
+        old_domains = payload.get("domain", [])
+        if isinstance(old_domains, str):
+            old_domains = [old_domains]
+
+        new_domains, changed = remap_concept_domains(old_domains)
+        if not new_domains:
+            new_domains = ["Foundational Skills"]
+
+        if changed:
+            stats["qdrant_updated"] += 1
+            if not dry_run:
+                qdrant.set_payload(
+                    collection_name=collection,
+                    payload={"domain": new_domains},
+                    points=[point.id],
+                )
+
+    # Also fix concept JSON files on disk
+    json_fixed = 0
+    for window_file in CONCEPTS_DIR.rglob("window_*.json"):
+        try:
+            with open(window_file, "r", encoding="utf-8") as f:
+                concepts = json.load(f)
+        except Exception:
+            continue
+
+        if not isinstance(concepts, list):
+            continue
+
+        file_changed = False
+        for concept in concepts:
+            if not isinstance(concept, dict):
+                continue
+            raw = concept.get("domain", [])
+            if isinstance(raw, str):
+                raw = [raw]
+            new, changed = remap_concept_domains(raw)
+            if changed:
+                concept["domain"] = new if new else ["Foundational Skills"]
+                file_changed = True
+
+        if file_changed:
+            json_fixed += 1
+            if not dry_run:
+                with open(window_file, "w", encoding="utf-8") as f:
+                    json.dump(concepts, f, indent=2, ensure_ascii=False)
+
+    log.info(f"Pass 1 complete: {stats['qdrant_updated']} Qdrant points updated, {json_fixed} JSON files updated")
+    return stats
+
+# ── PASS 2: Re-enrich empty domain concepts ──────────────────────────────────
+
+def pass2_empty_domains(qdrant, collection, key_rotator, dry_run):
+    log.info("=== PASS 2: Re-enriching empty domain concepts ===")
+    stats = defaultdict(int)
+
+    # Find empty domain points in Qdrant
+    offset = None
+    empty_points = []
+    while True:
+        results, offset = qdrant.scroll(
+            collection_name=collection,
+            limit=500,
+            with_payload=True,
+            with_vectors=False,
+            offset=offset,
+        )
+        for r in results:
+            d = r.payload.get("domain", [])
+            if not d or d == [] or d == [""]:
+                empty_points.append(r)
+        if offset is None:
+            break
+
+    log.info(f"Found {len(empty_points)} points with empty domains")
+
+    for point in empty_points:
+        payload = point.payload
+        title = payload.get("title", "")
+        subdomains = payload.get("subdomain", [])
+        content = payload.get("content", payload.get("summary", ""))
+
+        key = key_rotator.next()
+        new_domains = classify_concept(title, subdomains, content, key)
+        stats["classified"] += 1
+
+        if not dry_run:
+            qdrant.set_payload(
+                collection_name=collection,
+                payload={"domain": new_domains},
+                points=[point.id],
+            )
+
+        # Also update the concept JSON on disk
+        doc_hash = payload.get("doc_hash", "")
+        if doc_hash:
+            doc_concepts_dir = CONCEPTS_DIR / doc_hash
+            if doc_concepts_dir.exists():
+                for wf in doc_concepts_dir.glob("window_*.json"):
+                    try:
+                        with open(wf, "r", encoding="utf-8") as f:
+                            concepts = json.load(f)
+                        changed = False
+                        for c in concepts:
+                            if isinstance(c, dict) and c.get("title") == title:
+                                d = c.get("domain", [])
+                                if not d or d == []:
+                                    c["domain"] = new_domains
+                                    changed = True
+                        if changed and not dry_run:
+                            with open(wf, "w", encoding="utf-8") as f:
+                                json.dump(concepts, f, indent=2, ensure_ascii=False)
+                    except Exception:
+                        pass
+
+        time.sleep(0.05)
+
+    log.info(f"Pass 2 complete: {stats['classified']} concepts re-classified")
+    return stats
+
+# ── PASS 3: Purge junk URLs ──────────────────────────────────────────────────
+
+def is_junk_url(url):
+    url_lower = url.lower()
+    return any(pattern.lower() in url_lower for pattern in JUNK_URL_PATTERNS)
+
+def pass3_purge_junk(qdrant, collection, dry_run):
+    log.info("=== PASS 3: Purging junk URLs ===")
+    stats = defaultdict(int)
+
+    # Scroll all web-source points and find junk
+    offset = None
+    junk_point_ids = []
+    junk_doc_hashes = set()
+
+    while True:
+        results, offset = qdrant.scroll(
+            collection_name=collection,
+            scroll_filter=Filter(
+                must=[FieldCondition(key="source_type", match=MatchAny(any=["web"]))]
+            ),
+            limit=500,
+            with_payload=True,
+            with_vectors=False,
+            offset=offset,
+        )
+        for r in results:
+            filename = r.payload.get("filename", "")
+            doc_hash = r.payload.get("doc_hash", "")
+            if is_junk_url(filename):
+                junk_point_ids.append(r.id)
+                if doc_hash:
+                    junk_doc_hashes.add(doc_hash)
+        if offset is None:
+            break
+
+    log.info(f"Found {len(junk_point_ids)} junk vectors across {len(junk_doc_hashes)} documents")
+
+    if not dry_run and junk_point_ids:
+        # Delete in batches
+        batch_size = 500
+        for i in range(0, len(junk_point_ids), batch_size):
+            batch = junk_point_ids[i:i + batch_size]
+            qdrant.delete(collection_name=collection, points_selector=batch)
+        log.info(f"Deleted {len(junk_point_ids)} junk vectors from Qdrant")
+
+        # Mark junk docs as skipped in SQLite
+        conn = sqlite3.connect(str(DB_PATH))
+        for doc_hash in junk_doc_hashes:
+            conn.execute(
+                "UPDATE documents SET status = 'skipped', error_message = 'junk content purged' WHERE hash = ?",
+                (doc_hash,)
+            )
+        conn.commit()
+        conn.close()
+        log.info(f"Marked {len(junk_doc_hashes)} documents as skipped in DB")
+
+    stats["junk_vectors"] = len(junk_point_ids)
+    stats["junk_docs"] = len(junk_doc_hashes)
+    log.info(f"Pass 3 complete: {stats['junk_vectors']} vectors, {stats['junk_docs']} docs purged")
+    return stats
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--skip-pass", type=int, action="append", default=[])
+    args = parser.parse_args()
+
+    config = get_config()
+    keys = load_gemini_keys()
+    rotator = KeyRotator(keys)
+
+    qdrant = QdrantClient(
+        host=config['vector_db']['host'],
+        port=config['vector_db']['port'],
+        timeout=60
+    )
+    collection = config['vector_db']['collection']
+
+    log.info(f"Starting cleanup | dry_run={args.dry_run} | skipping passes: {args.skip_pass}")
+
+    if 1 not in args.skip_pass:
+        pass1_remap_outliers(qdrant, collection, args.dry_run)
+
+    if 2 not in args.skip_pass:
+        pass2_empty_domains(qdrant, collection, rotator, args.dry_run)
+
+    if 3 not in args.skip_pass:
+        pass3_purge_junk(qdrant, collection, args.dry_run)
+
+    log.info("All passes complete.")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/domain_reenrich.py
+++ b/scripts/domain_reenrich.py
@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""
+domain_reenrich.py — Re-enriches solo-Reference concepts that domain_remap.py
+couldn't fix via subdomain lookup. Reads remap_unknowns.jsonl, calls Gemini
+with a lightweight classification-only prompt, updates domain in-place.
+
+Usage:
+  python3 /opt/recon/scripts/domain_reenrich.py [--workers 16] [--limit N]
+
+Reads:  /opt/recon/data/remap_unknowns.jsonl
+Writes: domain field in-place in window JSON files
+Log:    /opt/recon/logs/domain_reenrich.log
+"""
+
+import json
+import time
+import random
+import logging
+import argparse
+import threading
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import defaultdict
+
+import google.generativeai as genai
+
+# Input: one JSON object per line; main() parses every non-blank line.
+UNKNOWNS_FILE = Path("/opt/recon/data/remap_unknowns.jsonl")
+LOG_FILE = Path("/opt/recon/logs/domain_reenrich.log")
+
+# Log to both a file and the console. logging.FileHandler opens LOG_FILE as
+# soon as it is constructed, so /opt/recon/logs must exist before this
+# module is imported.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+    handlers=[
+        logging.FileHandler(LOG_FILE),
+        logging.StreamHandler(),
+    ]
+)
+log = logging.getLogger("domain_reenrich")
+
+# Kept as a list so the order shown in the prompt is stable.
+CANONICAL_DOMAINS = [
+    "Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
+    "Foundational Skills", "Communications", "Medical", "Food Systems",
+    "Navigation", "Logistics", "Power Systems", "Leadership",
+    "Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
+]
+
+# Set view of the same names, used to validate model answers.
+DOMAIN_SET = set(CANONICAL_DOMAINS)
+
+# Prompt template for classify_concept(). Filled with str.format(), which
+# substitutes {domains} (one canonical name per line), {title}, {subdomain}
+# and {content}; the doubled braces around the JSON example are format()
+# escapes for literal { and }.
+CLASSIFY_PROMPT = """\
+Classify this knowledge concept into one or more domains.
+
+VALID DOMAINS (use ONLY these exact strings, no others):
+{domains}
+
+Concept title: {title}
+Concept tags: {subdomain}
+Concept preview: {content}
+
+Return ONLY valid JSON, no markdown, no explanation:
+{{"domain": ["Domain Name"]}}
+
+Rules:
+- Use only the domain strings listed above, spelled exactly
+- If genuinely multi-domain assign all that apply
+- Never return empty domain list — pick the closest match
+- Medical content, herbs, first aid, veterinary → Medical
+- Food growing, foraging, hunting, livestock → Sustainment Systems
+- Food preservation, canning, storage → Food Systems
+- Solar, wind, batteries, generators → Power Systems
+- Water sourcing, filtration, sanitation → Water Systems
+"""
+
+def load_gemini_keys():
+    env = Path("/opt/recon/.env")
+    keys = []
+    for line in env.read_text().splitlines():
+        if line.startswith("GEMINI_KEY_"):
+            keys.append(line.split("=", 1)[1].strip())
+    return keys
+
+class KeyRotator:
+    def __init__(self, keys):
+        self.keys = keys
+        self._i = 0
+        self._lock = threading.Lock()
+    def next(self):
+        with self._lock:
+            key = self.keys[self._i % len(self.keys)]
+            self._i += 1
+            return key
+
+def classify_concept(title, subdomains, content, key):
+    prompt = CLASSIFY_PROMPT.format(
+        domains="\n".join(f"  {d}" for d in CANONICAL_DOMAINS),
+        title=title or "(untitled)",
+        subdomain=", ".join(subdomains[:10]) if subdomains else "(none)",
+        content=content[:300] if content else "(none)",
+    )
+    genai.configure(api_key=key)
+    model = genai.GenerativeModel(
+        "gemini-2.0-flash",
+        generation_config={"response_mime_type": "application/json"}
+    )
+    for attempt in range(4):
+        try:
+            resp = model.generate_content(prompt)
+            data = json.loads(resp.text)
+            domains = [d for d in data.get("domain", []) if d in DOMAIN_SET]
+            if domains:
+                return domains
+        except Exception as e:
+            err = str(e).lower()
+            if any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
+                delay = min(5 * (2 ** attempt) + random.uniform(0, 3), 60)
+                time.sleep(delay)
+            else:
+                break
+    return ["Foundational Skills"]  # last-resort fallback
+
+def process_unknown(item, key_rotator):
+    filepath = Path(item["filepath"])
+    title = item.get("title", "")
+    subdomains = item.get("subdomain", [])
+    content = item.get("content_preview", "")
+
+    if not filepath.exists():
+        return "file_missing"
+
+    try:
+        with open(filepath, "r", encoding="utf-8") as f:
+            concepts = json.load(f)
+    except Exception:
+        return "read_error"
+
+    if not isinstance(concepts, list):
+        return "not_list"
+
+    # Find this concept by title and update its domain
+    matched = False
+    for concept in concepts:
+        if not isinstance(concept, dict):
+            continue
+        if concept.get("title", "") == title:
+            raw = concept.get("domain", [])
+            if isinstance(raw, str):
+                raw = [raw]
+            # Only re-enrich if still stuck on Reference
+            if raw == ["Reference"] or raw == []:
+                key = key_rotator.next()
+                new_domains = classify_concept(title, subdomains, content, key)
+                concept["domain"] = new_domains
+                concept["_reenriched"] = True
+                matched = True
+                break
+
+    if not matched:
+        return "already_fixed"
+
+    try:
+        with open(filepath, "w", encoding="utf-8") as f:
+            json.dump(concepts, f, indent=2, ensure_ascii=False)
+    except Exception:
+        return "write_error"
+
+    return "ok"
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--workers", type=int, default=16)
+    parser.add_argument("--limit", type=int, default=None)
+    args = parser.parse_args()
+
+    keys = load_gemini_keys()
+    if not keys:
+        log.error("No Gemini keys found in .env")
+        return
+    rotator = KeyRotator(keys)
+
+    unknowns = []
+    with open(UNKNOWNS_FILE, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                unknowns.append(json.loads(line))
+
+    if args.limit:
+        unknowns = unknowns[:args.limit]
+
+    total = len(unknowns)
+    log.info(f"Re-enriching {total:,} concepts | {args.workers} workers | {len(keys)} API keys")
+    log.info(f"Estimated Gemini Flash cost: ~${total * 0.0004:.2f} (conservative)")
+
+    results = defaultdict(int)
+    lock = threading.Lock()
+    done = 0
+
+    with ThreadPoolExecutor(max_workers=args.workers) as ex:
+        futures = {ex.submit(process_unknown, item, rotator): item for item in unknowns}
+        for future in as_completed(futures):
+            status = future.result()
+            with lock:
+                results[status] += 1
+                done += 1
+                if done % 5000 == 0:
+                    pct = done / total * 100
+                    log.info(f"  Progress: {done:,}/{total:,} ({pct:.1f}%) | {dict(results)}")
+            time.sleep(0.05)
+
+    log.info("── Final Results ─────────────────────────────────────────────")
+    for status, count in sorted(results.items(), key=lambda x: -x[1]):
+        log.info(f"  {status:<25} {count:>10,}")
+    log.info(f"  Total: {total:,}")
+
+if __name__ == "__main__":
+    main()
--- a/scripts/domain_remap.py
+++ b/scripts/domain_remap.py
@ -0,0 +1,428 @@
+#!/usr/bin/env python3
+"""
+domain_remap.py — Fix RECON concept domain classifications without API calls.
+
+What this does:
+  1. Strips "Reference" from concepts that have other real domains
+  2. Remaps variant domain spellings to canonical names
+  3. Reclassifies solo-Reference concepts using their subdomain tags
+  4. Writes a JSONL file of true unknowns for API re-enrichment
+
+Each window file is a JSON array of concept dicts.
+Field names: "domain" (list), "subdomain" (list)
+
+Usage:
+  python3 /opt/recon/scripts/domain_remap.py --dry-run   # report only
+  python3 /opt/recon/scripts/domain_remap.py             # apply fixes
+  python3 /opt/recon/scripts/domain_remap.py --workers 16
+"""
+
+import json
+import argparse
+import threading
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import defaultdict
+
+# Root directory that main() searches recursively for window_*.json files.
+CONCEPTS_DIR = Path("/opt/recon/data/concepts")
+# JSONL file main() writes with the concepts no offline rule could classify.
+UNKNOWNS_OUTPUT = Path("/opt/recon/data/remap_unknowns.jsonl")
+
+# Domain names that remap_domains() accepts unchanged.
+CANONICAL_DOMAINS = {
+    "Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
+    "Foundational Skills", "Communications", "Medical", "Food Systems",
+    "Navigation", "Logistics", "Power Systems", "Leadership",
+    "Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
+}
+
+# Variant → Canonical mapping
+# remap_domains() looks labels up here by exact, case-sensitive key.
+# A value of None marks a noise domain that is discarded instead of remapped.
+VARIANT_MAP = {
+    # Defense & Tactics
+    "Tactical Ops": "Defense & Tactics",
+    "Tactical_Ops": "Defense & Tactics",
+    "Tactical Operations": "Defense & Tactics",
+    "Tactical": "Defense & Tactics",
+    "Tactical Skills": "Defense & Tactics",
+    "Tactics": "Defense & Tactics",
+    "Tactics & Defense": "Defense & Tactics",
+    "Reconnaissance": "Defense & Tactics",
+    "Fire Support": "Defense & Tactics",
+    "Improvised Munitions": "Defense & Tactics",
+    "Military Intelligence": "Defense & Tactics",
+    "Military History": "Defense & Tactics",
+    "Military Engineering": "Defense & Tactics",
+    # Medical
+    "Medical Care": "Medical",
+    "Medical Alternatives": "Medical",
+    "Medical/Dental": "Medical",
+    "Medical & Dental": "Medical",
+    "medical": "Medical",
+    "Medical Awareness": "Medical",
+    "Medical Disasters": "Medical",
+    "Medical Emergency Survival": "Medical",
+    "Medical Procedures": "Medical",
+    "Medical Treatment": "Medical",
+    "Medical Science": "Medical",
+    "Medical History": "Medical",
+    "Medical Diagnosis": "Medical",
+    "Medical Skills": "Medical",
+    "Medical Supply": "Medical",
+    "Medical Gear": "Medical",
+    "Medical Kits": "Medical",
+    # NOTE(review): grouped under Medical but mapped to Logistics — confirm intended.
+    "Medical Logistics": "Logistics",
+    "Medical First Aid": "Medical",
+    "Medical Ethics": "Medical",
+    "Medical Reference Ranges": "Medical",
+    "Medical andSurgical Hints": "Medical",
+    "Medical Aspects of Radiation Injury": "Medical",
+    "Medical Uses": "Medical",
+    "Medical Care in Developing Countries": "Medical",
+    "Survival Medicine": "Medical",
+    "Emergency War Surgery": "Medical",
+    "First Aid": "Medical",
+    "First Aid and Life Saving": "Medical",
+    "Veterinary Medicine": "Medical",
+    "Veterinary Hygiene": "Medical",
+    "Veterinary": "Medical",
+    "Pharmacology": "Medical",
+    "Public Health": "Medical",
+    "Health": "Medical",
+    # Food Systems
+    "Food_Systems": "Food Systems",
+    "Food_systems": "Food Systems",
+    "food_systems": "Food Systems",
+    "Food Preservation": "Food Systems",
+    "Food Safety": "Food Systems",
+    "Food Security": "Food Systems",
+    "Food & Nutrition": "Food Systems",
+    "Diet & Nutrition": "Food Systems",
+    "Culinary Arts": "Food Systems",
+    "Foodprocessing": "Food Systems",
+    "Food": "Food Systems",
+    # Sustainment Systems
+    "Sustainment_Systems": "Sustainment Systems",
+    "Agriculture": "Sustainment Systems",
+    "Agriculture & Natural Resources": "Sustainment Systems",
+    "Agriculture and Natural Resources": "Sustainment Systems",
+    "Horticulture": "Sustainment Systems",
+    "Gardening": "Sustainment Systems",
+    "Hydroponics": "Sustainment Systems",
+    "Survival Skills": "Sustainment Systems",
+    # Foundational Skills
+    "Foundational_Skills": "Foundational Skills",
+    "Primitive Living Skills": "Foundational Skills",
+    "Woodcraft": "Foundational Skills",
+    "Home Workshop": "Foundational Skills",
+    "Science": "Foundational Skills",
+    "Engineering": "Foundational Skills",
+    "Construction": "Foundational Skills",
+    "Industrial Processes": "Foundational Skills",
+    "Machine Technology": "Foundational Skills",
+    "Training": "Foundational Skills",
+    "Education": "Foundational Skills",
+    # Off-Grid Systems
+    "Off-Grid_Systems": "Off-Grid Systems",
+    "Appropriate Technology": "Off-Grid Systems",
+    # Power Systems
+    "Homebrewed Electricity": "Power Systems",
+    "Renewable Energy": "Power Systems",
+    "Renewable Energy FAQs": "Power Systems",
+    "Alternative Fuels": "Power Systems",
+    "Power_Systems": "Power Systems",
+    # Water Systems
+    "Water_Systems": "Water Systems",
+    # Community Coordination
+    "Community_Coordination": "Community Coordination",
+    "Community_coordination": "Community Coordination",
+    "Community": "Community Coordination",
+    # Leadership
+    "Leadership & Planning": "Leadership",
+    "Planning": "Leadership",
+    "Administration": "Leadership",
+    "Governance": "Leadership",
+    "Government": "Leadership",
+    # Communications
+    "Emergency Communications": "Communications",
+    # Security
+    "Security Systems": "Security",
+    # Logistics
+    "Transportation": "Logistics",
+    # Scenario Playbooks
+    "General Preparedness": "Scenario Playbooks",
+    "Emergency Preparedness": "Scenario Playbooks",
+    "Emergency Management": "Scenario Playbooks",
+    "Wilderness Preparedness": "Scenario Playbooks",
+    "Urban Preparedness": "Scenario Playbooks",
+    "Winter Preparedness": "Scenario Playbooks",
+    # Discard (noise domains)
+    "Humor": None,
+    "Recreation": None,
+    "Business": None,
+    "Finance": None,
+    "Economics": None,
+    "Economics/Finances": None,
+    "Weird Science": None,
+}
+
+# Subdomain keyword → canonical domain (for solo-Reference reclassification)
+# Keys are lowercase because classify_by_subdomain() compares them against the
+# lowercased subdomain tag; every key that matches contributes its domain, so
+# one tag can yield several domains.
+SUBDOMAIN_MAP = {
+    "first aid": "Medical",
+    "emergency care": "Medical",
+    "emergency medicine": "Medical",
+    "trauma": "Medical",
+    "anatomy": "Medical",
+    "oral rehydration": "Medical",
+    "ors": "Medical",
+    "pharmacology": "Medical",
+    "toxicology": "Medical",
+    "antidote": "Medical",
+    "nerve agent": "Defense & Tactics",
+    "chemical warfare": "Defense & Tactics",
+    "biological warfare": "Defense & Tactics",
+    "nbc": "Defense & Tactics",
+    "infectious disease": "Medical",
+    "microbiology": "Medical",
+    "virology": "Medical",
+    "bacteriology": "Medical",
+    "pediatric": "Medical",
+    "surgery": "Medical",
+    "wound care": "Medical",
+    "veterinary": "Medical",
+    "dental": "Medical",
+    "dentistry": "Medical",
+    "herbal": "Medical",
+    "medicinal plant": "Medical",
+    "medicinal herb": "Medical",
+    "herbalism": "Medical",
+    "food preservation": "Food Systems",
+    "canning": "Food Systems",
+    "fermentation": "Food Systems",
+    "food storage": "Food Systems",
+    "food safety": "Food Systems",
+    "cooking": "Food Systems",
+    "food processing": "Food Systems",
+    "agriculture": "Sustainment Systems",
+    "soil": "Sustainment Systems",
+    "permaculture": "Sustainment Systems",
+    "agroforestry": "Sustainment Systems",
+    "livestock": "Sustainment Systems",
+    "animal husbandry": "Sustainment Systems",
+    "beekeeping": "Sustainment Systems",
+    "foraging": "Sustainment Systems",
+    "hunting": "Sustainment Systems",
+    "fishing": "Sustainment Systems",
+    "gardening": "Sustainment Systems",
+    "mycology": "Sustainment Systems",
+    "mushroom": "Sustainment Systems",
+    "water purification": "Water Systems",
+    "water filtration": "Water Systems",
+    "water sanitation": "Water Systems",
+    "water disinfection": "Water Systems",
+    "water storage": "Water Systems",
+    "well construction": "Water Systems",
+    "rainwater": "Water Systems",
+    "solar": "Power Systems",
+    "wind turbine": "Power Systems",
+    "battery": "Power Systems",
+    "batteries": "Power Systems",
+    "generator": "Power Systems",
+    "photovoltaic": "Power Systems",
+    "charge controller": "Power Systems",
+    "inverter": "Power Systems",
+    "biogas": "Off-Grid Systems",
+    "biomass": "Off-Grid Systems",
+    "wood gasification": "Off-Grid Systems",
+    "rocket stove": "Off-Grid Systems",
+    "mechanical system": "Off-Grid Systems",
+    "power transmission": "Off-Grid Systems",
+    "radio": "Communications",
+    "ham radio": "Communications",
+    "amateur radio": "Communications",
+    "antenna": "Communications",
+    "meshtastic": "Communications",
+    "encryption": "Communications",
+    "navigation": "Navigation",
+    "celestial navigation": "Navigation",
+    "land navigation": "Navigation",
+    "map reading": "Navigation",
+    "compass": "Navigation",
+    "pottery": "Foundational Skills",
+    "ceramics": "Foundational Skills",
+    "blacksmithing": "Foundational Skills",
+    "woodworking": "Foundational Skills",
+    "leatherwork": "Foundational Skills",
+    "textile": "Foundational Skills",
+    "masonry": "Foundational Skills",
+    "metalworking": "Foundational Skills",
+    "historical technology": "Foundational Skills",
+    "weapons": "Defense & Tactics",
+    "firearms": "Defense & Tactics",
+    "ballistics": "Defense & Tactics",
+    "tactics": "Defense & Tactics",
+    "perimeter": "Security",
+    "surveillance": "Security",
+    "supply chain": "Logistics",
+    "logistics": "Logistics",
+    "leadership": "Leadership",
+    "governance": "Leadership",
+    "community": "Community Coordination",
+    "emergency preparedness": "Scenario Playbooks",
+    "disaster": "Scenario Playbooks",
+    "evacuation": "Scenario Playbooks",
+}
+
+
+def remap_domains(domains):
+    """Remap a list of domain strings — variants to canonical, strip Reference."""
+    result = set()
+    for d in domains:
+        if d == "Reference":
+            continue
+        if d in CANONICAL_DOMAINS:
+            result.add(d)
+        elif d in VARIANT_MAP:
+            mapped = VARIANT_MAP[d]
+            if mapped:  # None means discard
+                result.add(mapped)
+        # Unknown non-canonical domains: drop them
+    return list(result)
+
+
+def classify_by_subdomain(subdomains):
+    """Try to infer canonical domain(s) from subdomain keyword matching."""
+    found = set()
+    for sd in subdomains:
+        sd_lower = sd.lower().strip()
+        for key, domain in SUBDOMAIN_MAP.items():
+            if key in sd_lower:
+                found.add(domain)
+    return list(found) if found else None
+
+
+def process_window_file(filepath, dry_run):
+    """Process one window JSON file (array of concepts). Returns per-file stats."""
+    stats = defaultdict(int)
+    unknowns = []
+
+    try:
+        with open(filepath, "r", encoding="utf-8") as f:
+            concepts = json.load(f)
+    except Exception as e:
+        return {"parse_error": 1}, []
+
+    if not isinstance(concepts, list):
+        return {"skip_not_list": 1}, []
+
+    modified = False
+
+    for concept in concepts:
+        if not isinstance(concept, dict):
+            continue
+
+        raw_domains = concept.get("domain", [])
+        if isinstance(raw_domains, str):
+            raw_domains = [raw_domains]
+
+        subdomains = concept.get("subdomain", [])
+        if isinstance(subdomains, str):
+            subdomains = [subdomains]
+
+        has_reference = "Reference" in raw_domains
+        non_reference = [d for d in raw_domains if d != "Reference"]
+
+        if not has_reference:
+            # No Reference — just fix any variant names
+            remapped = remap_domains(raw_domains)
+            if set(remapped) != set(raw_domains):
+                concept["domain"] = remapped
+                modified = True
+                stats["variant_remapped"] += 1
+            else:
+                stats["no_change"] += 1
+            continue
+
+        # Has Reference — what else does it have?
+        remapped_others = remap_domains(non_reference)
+
+        if remapped_others:
+            # Reference + real domains: drop Reference, keep the rest
+            concept["domain"] = remapped_others
+            modified = True
+            stats["reference_stripped"] += 1
+            continue
+
+        # Solo Reference (or Reference + only-noise): try subdomain lookup
+        inferred = classify_by_subdomain(subdomains)
+        if inferred:
+            concept["domain"] = inferred
+            concept["_reclassified_from_reference"] = True
+            modified = True
+            stats["subdomain_reclassified"] += 1
+            continue
+
+        # True unknown — needs API re-enrichment
+        unknowns.append({
+            "filepath": str(filepath),
+            "title": concept.get("title", ""),
+            "subdomain": subdomains,
+            "content_preview": str(concept.get("content", concept.get("summary", "")))[:300],
+        })
+        stats["needs_enrichment"] += 1
+
+    if modified and not dry_run:
+        with open(filepath, "w", encoding="utf-8") as f:
+            json.dump(concepts, f, indent=2, ensure_ascii=False)
+
+    return dict(stats), unknowns
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Remap RECON concept domains")
+    parser.add_argument("--dry-run", action="store_true", help="Report without writing")
+    parser.add_argument("--workers", type=int, default=16)
+    args = parser.parse_args()
+
+    print(f"[REMAP] Scanning {CONCEPTS_DIR}")
+    print(f"[REMAP] Dry run: {args.dry_run} | Workers: {args.workers}")
+
+    window_files = [
+        f for f in CONCEPTS_DIR.rglob("window_*.json")
+    ]
+    print(f"[REMAP] Found {len(window_files):,} window files")
+
+    total_stats = defaultdict(int)
+    all_unknowns = []
+    lock = threading.Lock()
+    done = 0
+
+    with ThreadPoolExecutor(max_workers=args.workers) as ex:
+        futures = {ex.submit(process_window_file, f, args.dry_run): f for f in window_files}
+        for future in as_completed(futures):
+            file_stats, unknowns = future.result()
+            with lock:
+                for k, v in file_stats.items():
+                    total_stats[k] += v
+                all_unknowns.extend(unknowns)
+                done += 1
+                if done % 5000 == 0:
+                    print(f"  {done:,}/{len(window_files):,} files processed...")
+
+    print("\n── Results ─────────────────────────────────────────────────")
+    for status, count in sorted(total_stats.items(), key=lambda x: -x[1]):
+        print(f"  {status:<35} {count:>10,}")
+
+    total_concepts = sum(total_stats.values())
+    print(f"\n  Total concepts processed:       {total_concepts:>10,}")
+    print(f"  True unknowns for re-enrichment:{len(all_unknowns):>10,}")
+
+    if not args.dry_run and all_unknowns:
+        with open(UNKNOWNS_OUTPUT, "w", encoding="utf-8") as f:
+            for item in all_unknowns:
+                f.write(json.dumps(item) + "\n")
+        print(f"\n  Unknowns written to: {UNKNOWNS_OUTPUT}")
+
+    if args.dry_run:
+        print("\n  [DRY RUN] No files were modified.")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/migrate_domains.py
+++ b/scripts/migrate_domains.py
@ -0,0 +1,469 @@
+#!/usr/bin/env python3
+"""
+migrate_domains.py — Reclassify 5 legacy domains via Gemini Flash.
+
+Targets: Sustainment Systems, Off-Grid Systems, Defense & Tactics,
+         Community Coordination, Leadership
+
+Maps each to one of the 18 approved domains. 16 parallel workers,
+checkpoint file, crash-safe, incremental saves, progress every 5,000.
+
+Usage:
+  python3 /opt/recon/scripts/migrate_domains.py [--dry-run] [--workers 16] [--limit N]
+"""
+
+import json
+import time
+import random
+import logging
+import argparse
+import threading
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import defaultdict
+
+import google.generativeai as genai
+from qdrant_client import QdrantClient
+from qdrant_client.models import FieldCondition, MatchValue, Filter
+
+# Suppress noisy HTTP logs
+# (logging is already imported above; the alias is a second name for the same module.)
+import logging as _logging
+_logging.getLogger("httpx").setLevel(_logging.WARNING)
+_logging.getLogger("qdrant_client").setLevel(_logging.WARNING)
+
+# Log file for this script; handed to logging.FileHandler below.
+LOG_FILE = Path("/opt/recon/logs/migrate_domains.log")
+# JSON file of completed point ids; main() passes it to Checkpoint.
+CHECKPOINT_FILE = Path("/opt/recon/data/migrate_domains_checkpoint.json")
+
+# Records go both to LOG_FILE and to a StreamHandler.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
+)
+log = logging.getLogger("migrate_domains")
+
+# ── Constants ───────────────────────────────────────────────────────────────
+
+VALID_DOMAINS = {
+    'Agriculture & Livestock', 'Civil Organization', 'Communications',
+    'Food Systems', 'Foundational Skills', 'Logistics', 'Medical',
+    'Navigation', 'Operations', 'Power Systems', 'Preservation & Storage',
+    'Security', 'Shelter & Construction', 'Technology', 'Tools & Equipment',
+    'Vehicles', 'Water Systems', 'Wilderness Skills',
+}
+
+SOURCE_DOMAINS = {
+    'Sustainment Systems', 'Off-Grid Systems', 'Defense & Tactics',
+    'Community Coordination', 'Leadership',
+}
+
+DOMAIN_LIST_STR = ', '.join(sorted(VALID_DOMAINS))
+
+CLASSIFY_PROMPT = """\
+Classify this knowledge concept into exactly one domain from this list:
+Agriculture & Livestock, Civil Organization, Communications, Food Systems, Foundational Skills, Logistics, Medical, Navigation, Operations, Power Systems, Preservation & Storage, Security, Shelter & Construction, Technology, Tools & Equipment, Vehicles, Water Systems, Wilderness Skills
+
+Return ONLY the exact domain string, nothing else. No explanation, no punctuation, no quotes.
+
+Content: {content}
+Summary: {summary}
+Subdomain: {subdomain}
+"""
+
+DOMAIN_FALLBACK = 'Foundational Skills'
+
+# ── Key management ──────────────────────────────────────────────────────────
+
+def load_gemini_keys():
+    keys = []
+    env_path = Path("/opt/recon/.env")
+    if not env_path.exists():
+        raise FileNotFoundError(f"{env_path} not found")
+    for line in env_path.read_text().splitlines():
+        if line.startswith("GEMINI_KEY_"):
+            keys.append(line.split("=", 1)[1].strip())
+    if not keys:
+        raise ValueError("No GEMINI_KEY_* found in .env")
+    return keys
+
+
+class KeyRotator:
+    def __init__(self, keys):
+        self.keys = keys
+        self._i = 0
+        self._lock = threading.Lock()
+
+    def next(self):
+        with self._lock:
+            key = self.keys[self._i % len(self.keys)]
+            self._i += 1
+            return key
+
+
+# ── Classification ──────────────────────────────────────────────────────────
+
+def classify_domain(content, summary, subdomains, key):
+    """Call Gemini Flash to classify into one of 18 domains."""
+    prompt = CLASSIFY_PROMPT.format(
+        content=str(content)[:400] if content else "(none)",
+        summary=str(summary)[:200] if summary else "(none)",
+        subdomain=", ".join(subdomains[:10]) if subdomains else "(none)",
+    )
+    genai.configure(api_key=key)
+    model = genai.GenerativeModel(
+        "gemini-2.0-flash",
+        generation_config={"response_mime_type": "text/plain"}
+    )
+
+    for retry in range(4):
+        try:
+            resp = model.generate_content(prompt)
+            value = resp.text.strip().strip('"').strip("'").strip()
+            if value in VALID_DOMAINS:
+                return value
+            # Try case-insensitive match
+            for valid in VALID_DOMAINS:
+                if value.lower() == valid.lower():
+                    return valid
+            # Partial match — Gemini sometimes returns with trailing period
+            clean = value.rstrip('.')
+            if clean in VALID_DOMAINS:
+                return clean
+            # Invalid — retry with stricter prompt
+            if retry < 3:
+                prompt = (
+                    f"Your previous response '{value}' was invalid. "
+                    f"You must return ONLY one of these exact strings: {DOMAIN_LIST_STR}\n\n"
+                    f"Content: {str(content)[:300]}\n"
+                    f"Return ONLY the exact domain string."
+                )
+                continue
+        except Exception as e:
+            err = str(e).lower()
+            if any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
+                time.sleep(min(5 * (2 ** retry) + random.uniform(0, 3), 60))
+            else:
+                log.warning(f"Gemini error (attempt {retry+1}): {e}")
+                if retry >= 2:
+                    break
+
+    return heuristic_fallback(content, summary, subdomains)
+
+
+def heuristic_fallback(content, summary, subdomains):
+    """Last-resort heuristic when Gemini fails or returns invalid."""
+    text = f"{summary or ''} {' '.join(subdomains or [])} {str(content or '')[:200]}".lower()
+
+    mapping = [
+        (["farming", "agriculture", "livestock", "animal husbandry", "poultry",
+          "cattle", "crop", "soil fertility", "irrigation for crops"], "Agriculture & Livestock"),
+        (["foraging", "hunting", "fishing", "bushcraft", "wilderness", "survival skill",
+          "fire starting", "shelter building", "trapping", "tracking"], "Wilderness Skills"),
+        (["food preservation", "canning", "dehydration", "smoking", "pickling",
+          "fermentation", "food storage", "freeze dry"], "Preservation & Storage"),
+        (["cooking", "recipe", "nutrition", "food preparation", "baking",
+          "food production", "meal"], "Food Systems"),
+        (["first aid", "medical", "trauma", "surgery", "anatomy", "pharmacology",
+          "wound", "triage", "diagnosis", "disease", "infection", "veterinary",
+          "herbal medicine", "medicinal plant"], "Medical"),
+        (["radio", "antenna", "ham radio", "communication", "signal",
+          "networking", "meshtastic", "comms"], "Communications"),
+        (["solar", "battery", "generator", "wind turbine", "hydroelectric",
+          "power grid", "inverter", "photovoltaic", "electricity"], "Power Systems"),
+        (["water purification", "water filter", "well", "rainwater",
+          "sanitation", "water treatment", "desalination"], "Water Systems"),
+        (["navigation", "compass", "map reading", "gps", "celestial",
+          "orienteering", "land nav"], "Navigation"),
+        (["security", "opsec", "perimeter", "surveillance", "threat",
+          "intrusion detection", "physical security"], "Security"),
+        (["vehicle", "engine", "motor", "aircraft", "boat", "motorcycle",
+          "truck", "maintenance", "diesel", "transmission"], "Vehicles"),
+        (["tool", "equipment", "wrench", "saw", "drill", "hammer",
+          "hand tool", "power tool", "blade", "sharpening"], "Tools & Equipment"),
+        (["construction", "building", "shelter", "carpentry", "masonry",
+          "roofing", "concrete", "framing", "plumbing"], "Shelter & Construction"),
+        (["electronics", "computer", "software", "circuit", "programming",
+          "technology", "digital", "engineering"], "Technology"),
+        (["supply chain", "logistics", "transport", "distribution",
+          "inventory", "supply", "stockpile"], "Logistics"),
+        (["governance", "civil", "community", "administration", "organization",
+          "council", "democratic", "municipal"], "Civil Organization"),
+        (["tactics", "combat", "military", "mission", "patrol", "ambush",
+          "defensive position", "fire team", "maneuver", "engagement",
+          "search and rescue", "sar", "reconnaissance"], "Operations"),
+    ]
+
+    for keywords, domain in mapping:
+        if any(kw in text for kw in keywords):
+            return domain
+
+    return DOMAIN_FALLBACK
+
+
+# ── Checkpoint ──────────────────────────────────────────────────────────────
+
+class Checkpoint:
+    """Thread-safe checkpoint tracker for crash recovery."""
+    def __init__(self, path):
+        self.path = path
+        self._lock = threading.Lock()
+        self._completed = set()
+        self._dirty = 0
+        self._load()
+
+    def _load(self):
+        if self.path.exists():
+            try:
+                data = json.loads(self.path.read_text())
+                self._completed = set(data.get("completed", []))
+                log.info(f"Loaded checkpoint: {len(self._completed):,} completed points")
+            except Exception:
+                self._completed = set()
+
+    def is_done(self, point_id):
+        return point_id in self._completed
+
+    def mark_done(self, point_id):
+        with self._lock:
+            self._completed.add(point_id)
+            self._dirty += 1
+            if self._dirty >= 1000:
+                self._flush()
+
+    def _flush(self):
+        tmp = self.path.with_suffix('.tmp')
+        tmp.write_text(json.dumps({"completed": list(self._completed)}))
+        tmp.rename(self.path)
+        self._dirty = 0
+
+    def flush(self):
+        with self._lock:
+            self._flush()
+
+    def count(self):
+        return len(self._completed)
+
+
+# ── Per-point processing ───────────────────────────────────────────────────
+
+def process_point(point, qdrant, collection, key_rotator, checkpoint, dry_run, stats):
+    point_id = point.id
+    if checkpoint.is_done(point_id):
+        return "skipped"
+
+    payload = point.payload
+    content = payload.get("content", payload.get("summary", ""))
+    summary = payload.get("summary", "")
+    subdomains = payload.get("subdomain", [])
+    if isinstance(subdomains, str):
+        subdomains = [subdomains]
+    old_domain = payload.get("domain", [])
+    if isinstance(old_domain, list):
+        old_domain_str = old_domain[0] if old_domain else "(empty)"
+    else:
+        old_domain_str = str(old_domain)
+
+    key = key_rotator.next()
+    new_domain = classify_domain(content, summary, subdomains, key)
+
+    # Track the mapping
+    stats_key = f"{old_domain_str} -> {new_domain}"
+    stats[stats_key] = stats.get(stats_key, 0) + 1
+
+    if dry_run:
+        return f"would: {old_domain_str} -> {new_domain}"
+
+    # Write new domain as single string
+    qdrant.set_payload(
+        collection_name=collection,
+        payload={"domain": new_domain},
+        points=[point_id],
+    )
+
+    checkpoint.mark_done(point_id)
+    return "ok"
+
+
+# ── Main loop ───────────────────────────────────────────────────────────────
+
+SCROLL_BATCH = 5000
+
+
+def count_source_domains(qdrant, collection):
+    """Count vectors with source domains."""
+    counts = {}
+    for domain in SOURCE_DOMAINS:
+        result = qdrant.count(
+            collection_name=collection,
+            count_filter=Filter(
+                must=[FieldCondition(key="domain", match=MatchValue(value=domain))]
+            ),
+            exact=True,
+        )
+        counts[domain] = result.count
+    return counts
+
+
+def stream_and_process(qdrant, collection, rotator, checkpoint, workers, limit=None, dry_run=False):
+    """Scroll source domains in batches, process with thread pool."""
+    lock = threading.Lock()
+    done = 0
+    skipped_checkpoint = 0
+    start = time.time()
+    stats = {}  # shared mapping stats
+
+    for source_domain in sorted(SOURCE_DOMAINS):
+        log.info(f"\n--- Processing domain: {source_domain} ---")
+        offset = None
+        domain_done = 0
+
+        while True:
+            scroll_results, offset = qdrant.scroll(
+                collection_name=collection,
+                limit=SCROLL_BATCH,
+                with_payload=True,
+                with_vectors=False,
+                offset=offset,
+                scroll_filter=Filter(
+                    must=[FieldCondition(key="domain", match=MatchValue(value=source_domain))]
+                ),
+            )
+
+            if not scroll_results:
+                if offset is None:
+                    break
+                continue
+
+            # Filter already checkpointed
+            pending = [p for p in scroll_results if not checkpoint.is_done(p.id)]
+            skipped_checkpoint += len(scroll_results) - len(pending)
+
+            if pending:
+                with ThreadPoolExecutor(max_workers=workers) as ex:
+                    futures = {
+                        ex.submit(process_point, p, qdrant, collection, rotator,
+                                  checkpoint, dry_run, stats): p
+                        for p in pending
+                    }
+                    for future in as_completed(futures):
+                        try:
+                            future.result()
+                        except Exception as e:
+                            log.error(f"Worker error: {e}")
+                        with lock:
+                            done += 1
+                            domain_done += 1
+                            if done % 5000 == 0:
+                                elapsed = time.time() - start
+                                rate = done / elapsed * 60
+                                log.info(f"  {done:,} done | {rate:.0f}/min | "
+                                         f"elapsed {elapsed/60:.1f}min")
+                                checkpoint.flush()
+                        time.sleep(0.02)
+
+            if limit and done >= limit:
+                break
+            if offset is None:
+                break
+
+        log.info(f"  {source_domain}: {domain_done:,} vectors processed")
+
+        if limit and done >= limit:
+            break
+
+    checkpoint.flush()
+    return done, skipped_checkpoint, stats, start
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true",
+                        help="Classify 20 samples without writing")
+    parser.add_argument("--workers", type=int, default=16)
+    parser.add_argument("--limit", type=int, default=None)
+    args = parser.parse_args()
+
+    keys = load_gemini_keys()
+    rotator = KeyRotator(keys)
+
+    qdrant = QdrantClient(host="localhost", port=6333, timeout=120)
+    collection = "recon_knowledge"
+    checkpoint = Checkpoint(CHECKPOINT_FILE)
+
+    # Count source domains
+    counts = count_source_domains(qdrant, collection)
+    total_source = sum(counts.values())
+    pre_checkpoint = checkpoint.count()
+
+    log.info(f"Source domain counts:")
+    for domain, count in sorted(counts.items(), key=lambda x: -x[1]):
+        log.info(f"  {domain:30s} {count:>10,}")
+    log.info(f"  {'TOTAL':30s} {total_source:>10,}")
+    log.info(f"Checkpoint: {pre_checkpoint:,} already completed")
+    log.info(f"Workers: {args.workers} | Keys: {len(keys)}")
+
+    # Cost estimate
+    remaining = total_source - pre_checkpoint
+    input_tokens = remaining * 200
+    output_tokens = remaining * 5
+    input_cost = input_tokens / 1_000_000 * 0.10
+    output_cost = output_tokens / 1_000_000 * 0.40
+    total_cost = input_cost + output_cost
+    log.info(f"\nEstimated Gemini 2.0 Flash cost:")
+    log.info(f"  Vectors to process: {remaining:,}")
+    log.info(f"  Input:  ~{input_tokens/1_000_000:.1f}M tokens = ${input_cost:.2f}")
+    log.info(f"  Output: ~{output_tokens/1_000_000:.1f}M tokens = ${output_cost:.2f}")
+    log.info(f"  TOTAL:  ~${total_cost:.2f}")
+
+    if args.dry_run:
+        log.info(f"\nDRY RUN: classifying 20 samples...\n")
+        for source_domain in sorted(SOURCE_DOMAINS):
+            scroll_results, _ = qdrant.scroll(
+                collection_name=collection,
+                limit=5,
+                with_payload=True,
+                with_vectors=False,
+                scroll_filter=Filter(
+                    must=[FieldCondition(key="domain", match=MatchValue(value=source_domain))]
+                ),
+            )
+            for p in scroll_results[:4]:
+                pay = p.payload
+                title = pay.get("title", "(no title)")
+                content = pay.get("content", pay.get("summary", ""))
+                summary = pay.get("summary", "")
+                subdomains = pay.get("subdomain", [])
+                if isinstance(subdomains, str):
+                    subdomains = [subdomains]
+
+                key = rotator.next()
+                new_domain = classify_domain(content, summary, subdomains, key)
+
+                old = pay.get("domain", [])
+                if isinstance(old, list):
+                    old = old[0] if old else "?"
+                print(f"  [{old:25s}] -> [{new_domain:25s}]  {title[:60]}")
+
+        print(f"\nDRY RUN complete. ~{remaining:,} vectors would be migrated.")
+        print(f"Estimated cost: ~${total_cost:.2f}")
+        return
+
+    # ── Full migration ──────────────────────────────────────────────────
+    log.info(f"\nStarting full migration...")
+
+    done, skipped_ckpt, stats, start = stream_and_process(
+        qdrant, collection, rotator, checkpoint, args.workers, args.limit
+    )
+
+    elapsed = time.time() - start
+    log.info(f"\n{'='*70}")
+    log.info(f"MIGRATION COMPLETE in {elapsed/60:.1f}min:")
+    log.info(f"  Processed:            {done:,}")
+    log.info(f"  Skipped (checkpoint): {skipped_ckpt:,}")
+    log.info(f"  Rate:                 {done/elapsed*60:.0f}/min")
+    log.info(f"\nMapping distribution:")
+    for mapping, count in sorted(stats.items(), key=lambda x: -x[1])[:30]:
+        log.info(f"  {mapping:<55s} {count:>8,}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/migrate_skill_level.py
+++ b/scripts/migrate_skill_level.py
@ -0,0 +1,469 @@
+#!/usr/bin/env python3
+"""
+migrate_skill_level.py — Replaces skill_level with knowledge_type + complexity
+on all vectors in Qdrant and on-disk concept JSONs.
+
+Scrolls entire collection, classifies each concept via Gemini Flash,
+writes knowledge_type + complexity, deletes skill_level.
+
+Crash-safe: completed point IDs tracked in checkpoint file.
+
+Usage:
+  python3 /opt/recon/scripts/migrate_skill_level.py [--dry-run] [--workers 16] [--limit N]
+"""
+
+import json
+import time
+import random
+import logging
+import argparse
+import threading
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import defaultdict
+
+import google.generativeai as genai
+from qdrant_client import QdrantClient
+from qdrant_client.models import FieldCondition, MatchValue, Filter
+
+import sys
+sys.path.insert(0, '/opt/recon')
+from lib.utils import get_config, setup_logging
+
+# Suppress noisy HTTP request logging from qdrant_client/httpx
+import logging as _logging
+_logging.getLogger("httpx").setLevel(_logging.WARNING)
+_logging.getLogger("qdrant_client").setLevel(_logging.WARNING)
+
+# Hard-coded filesystem locations used by this script.
+LOG_FILE = Path("/opt/recon/logs/migrate_skill_level.log")
+CHECKPOINT_FILE = Path("/opt/recon/data/migrate_skill_level_checkpoint.json")
+CONCEPTS_DIR = Path("/opt/recon/data/concepts")
+
+# Send INFO-and-above records to both LOG_FILE and the console.
+# NOTE(review): FileHandler opens LOG_FILE when this module is loaded; this
+# presumably relies on /opt/recon/logs already existing — confirm.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
+)
+log = logging.getLogger("migrate_skill_level")
+
+# ── Prompt ──────────────────────────────────────────────────────────────────
+
+# Prompt template filled in by classify() via str.format() with the keyword
+# arguments title, domain, subdomain and content. The doubled braces on the
+# last line are str.format escapes that render as a literal JSON object.
+CLASSIFY_PROMPT = """\
+You are a knowledge classification engine. Given a concept, assign two fields:
+
+knowledge_type — what KIND of knowledge this is:
+  foundational — concepts, definitions, theory, background knowledge, explanations of how things work
+  procedural — step-by-step techniques, instructions, how-to skills, methods you execute
+  operational — application under real conditions, decision-making, mission execution, judgment calls in context
+
+complexity — how much prior knowledge is needed:
+  basic — requires little or no prior knowledge, introductory material, simple concepts
+  intermediate — requires some domain familiarity, assumes foundational knowledge is in place
+  advanced — requires significant experience or expertise, high-stakes or highly technical material
+
+EXAMPLES:
+- "Needle chest decompression procedure" → procedural, advanced
+- "What is soil texture and why does it matter" → foundational, basic
+- "Coordinating a fire team withdrawal under contact" → operational, advanced
+- "How to start a campfire with a ferro rod" → procedural, basic
+- "Antenna gain and radiation patterns explained" → foundational, intermediate
+- "Triage decision-making in a mass casualty event" → operational, advanced
+- "Step-by-step: building a Dakota fire hole" → procedural, intermediate
+- "Understanding the water cycle" → foundational, basic
+
+Concept title: {title}
+Concept domain: {domain}
+Concept subdomain: {subdomain}
+Concept content: {content}
+
+Return ONLY valid JSON, no markdown, no explanation:
+{{"knowledge_type": "foundational|procedural|operational", "complexity": "basic|intermediate|advanced"}}
+"""
+
+# The only values classify() accepts from the model; anything else is treated
+# as an invalid response.
+VALID_KNOWLEDGE_TYPES = {"foundational", "procedural", "operational"}
+VALID_COMPLEXITIES = {"basic", "intermediate", "advanced"}
+
+# ── Key management ──────────────────────────────────────────────────────────
+
+def load_gemini_keys():
+    keys = []
+    for line in Path("/opt/recon/.env").read_text().splitlines():
+        if line.startswith("GEMINI_KEY_"):
+            keys.append(line.split("=", 1)[1].strip())
+    return keys
+
+
+class KeyRotator:
+    def __init__(self, keys):
+        self.keys = keys
+        self._i = 0
+        self._lock = threading.Lock()
+
+    def next(self):
+        with self._lock:
+            key = self.keys[self._i % len(self.keys)]
+            self._i += 1
+            return key
+
+# ── Classification ──────────────────────────────────────────────────────────
+
+def classify(title, domains, subdomains, content, key):
+    """Call Gemini Flash to classify knowledge_type + complexity."""
+    prompt = CLASSIFY_PROMPT.format(
+        title=title or "(untitled)",
+        domain=", ".join(domains[:5]) if domains else "(none)",
+        subdomain=", ".join(subdomains[:10]) if subdomains else "(none)",
+        content=str(content)[:400] if content else "(none)",
+    )
+    genai.configure(api_key=key)
+    model = genai.GenerativeModel(
+        "gemini-2.0-flash",
+        generation_config={"response_mime_type": "application/json"}
+    )
+    for retry in range(4):
+        try:
+            resp = model.generate_content(prompt)
+            data = json.loads(resp.text)
+            kt = data.get("knowledge_type", "").lower().strip()
+            cx = data.get("complexity", "").lower().strip()
+            if kt in VALID_KNOWLEDGE_TYPES and cx in VALID_COMPLEXITIES:
+                return kt, cx
+            # Invalid values — retry once
+            if retry == 0:
+                continue
+        except Exception as e:
+            err = str(e).lower()
+            if any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
+                time.sleep(min(5 * (2 ** retry) + random.uniform(0, 3), 60))
+            else:
+                break
+
+    # Fallback heuristic based on old skill_level + content analysis
+    return heuristic_fallback(title, subdomains, content)
+
+
+def heuristic_fallback(title, subdomains, content):
+    """Last-resort heuristic when Gemini fails."""
+    text = f"{title} {' '.join(subdomains)} {str(content)[:200]}".lower()
+
+    # Knowledge type heuristic
+    procedural_signals = ["how to", "step-by-step", "procedure", "instructions",
+                          "method", "technique", "build", "make", "construct",
+                          "install", "assemble", "recipe", "prepare"]
+    operational_signals = ["decision", "coordinate", "execute", "deploy",
+                           "mission", "triage", "under fire", "in the field",
+                           "real-world", "scenario", "assessment", "plan"]
+
+    if any(s in text for s in operational_signals):
+        kt = "operational"
+    elif any(s in text for s in procedural_signals):
+        kt = "procedural"
+    else:
+        kt = "foundational"
+
+    # Complexity heuristic — default intermediate (safest middle ground)
+    cx = "intermediate"
+    basic_signals = ["introduction", "what is", "basic", "beginner", "overview",
+                     "definition", "simple", "fundamentals"]
+    advanced_signals = ["advanced", "expert", "complex", "critical", "high-stakes",
+                        "surgery", "trauma", "tactical", "classified"]
+    if any(s in text for s in basic_signals):
+        cx = "basic"
+    elif any(s in text for s in advanced_signals):
+        cx = "advanced"
+
+    return kt, cx
+
+# ── Checkpoint management ───────────────────────────────────────────────────
+
+class Checkpoint:
+    """Thread-safe checkpoint tracker for crash recovery."""
+    def __init__(self, path):
+        self.path = path
+        self._lock = threading.Lock()
+        self._completed = set()
+        self._dirty = 0
+        self._load()
+
+    def _load(self):
+        if self.path.exists():
+            try:
+                data = json.loads(self.path.read_text())
+                self._completed = set(data.get("completed", []))
+                log.info(f"Loaded checkpoint: {len(self._completed):,} completed points")
+            except Exception:
+                self._completed = set()
+
+    def is_done(self, point_id):
+        return point_id in self._completed
+
+    def mark_done(self, point_id):
+        with self._lock:
+            self._completed.add(point_id)
+            self._dirty += 1
+            if self._dirty >= 1000:
+                self._flush()
+
+    def _flush(self):
+        tmp = self.path.with_suffix('.tmp')
+        tmp.write_text(json.dumps({"completed": list(self._completed)}))
+        tmp.rename(self.path)
+        self._dirty = 0
+
+    def flush(self):
+        with self._lock:
+            self._flush()
+
+    def count(self):
+        return len(self._completed)
+
+# ── Concept JSON update ────────────────────────────────────────────────────
+
+def update_concept_json(doc_hash, title, knowledge_type, complexity):
+    """Update on-disk concept JSON: add knowledge_type + complexity, remove skill_level."""
+    doc_dir = CONCEPTS_DIR / doc_hash
+    if not doc_dir.exists():
+        return False
+    for wf in doc_dir.glob("window_*.json"):
+        try:
+            with open(wf, "r", encoding="utf-8") as f:
+                concepts = json.load(f)
+            changed = False
+            for c in concepts:
+                if not isinstance(c, dict):
+                    continue
+                if c.get("title") == title:
+                    c["knowledge_type"] = knowledge_type
+                    c["complexity"] = complexity
+                    c.pop("skill_level", None)
+                    changed = True
+            if changed:
+                with open(wf, "w", encoding="utf-8") as f:
+                    json.dump(concepts, f, indent=2, ensure_ascii=False)
+                return True
+        except Exception:
+            pass
+    return False
+
+# ── Per-point processing ───────────────────────────────────────────────────
+
+def process_point(point, qdrant, collection, key_rotator, checkpoint, dry_run):
+    point_id = point.id
+    if checkpoint.is_done(point_id):
+        return "skipped"
+
+    payload = point.payload
+    title = payload.get("title", "")
+    domains = payload.get("domain", [])
+    if isinstance(domains, str):
+        domains = [domains]
+    subdomains = payload.get("subdomain", [])
+    if isinstance(subdomains, str):
+        subdomains = [subdomains]
+    content = payload.get("content", payload.get("summary", ""))
+    doc_hash = payload.get("doc_hash", "")
+
+    key = key_rotator.next()
+    knowledge_type, complexity = classify(title, domains, subdomains, content, key)
+
+    if dry_run:
+        return f"kt={knowledge_type}, cx={complexity}"
+
+    # Write new fields
+    qdrant.set_payload(
+        collection_name=collection,
+        payload={"knowledge_type": knowledge_type, "complexity": complexity},
+        points=[point_id],
+    )
+
+    # Delete old field
+    qdrant.delete_payload(
+        collection_name=collection,
+        keys=["skill_level"],
+        points=[point_id],
+    )
+
+    # Update JSON on disk
+    if doc_hash:
+        update_concept_json(doc_hash, title, knowledge_type, complexity)
+
+    checkpoint.mark_done(point_id)
+    return "ok"
+
+# ── Streaming batch processor ───────────────────────────────────────────────
+
+SCROLL_BATCH = 5000  # vectors per scroll batch — keeps memory bounded (~50MB)
+
+
+def count_collection(qdrant, collection):
+    """Quick count of total vectors via collection info."""
+    info = qdrant.get_collection(collection)
+    return info.points_count
+
+
+def stream_and_process(qdrant, collection, rotator, checkpoint, workers, limit=None):
+    """Scroll in batches, process each batch with thread pool, then discard.
+
+    Memory-bounded: only holds SCROLL_BATCH payloads at any time (~50MB).
+    """
+    results_agg = defaultdict(int)
+    lock = threading.Lock()
+    done = 0
+    skipped_checkpoint = 0
+    skipped_no_skill = 0
+    total_estimate = count_collection(qdrant, collection)
+    start = time.time()
+
+    offset = None
+    batch_num = 0
+
+    while True:
+        batch_num += 1
+        scroll_results, offset = qdrant.scroll(
+            collection_name=collection,
+            limit=SCROLL_BATCH,
+            with_payload=True,
+            with_vectors=False,
+            offset=offset,
+        )
+
+        # Filter to points needing migration
+        pending = []
+        for p in scroll_results:
+            if "skill_level" not in p.payload:
+                skipped_no_skill += 1
+                continue
+            if checkpoint.is_done(p.id):
+                skipped_checkpoint += 1
+                continue
+            pending.append(p)
+
+        if pending:
+            with ThreadPoolExecutor(max_workers=workers) as ex:
+                futures = {
+                    ex.submit(process_point, p, qdrant, collection, rotator, checkpoint, False): p
+                    for p in pending
+                }
+                for future in as_completed(futures):
+                    try:
+                        status = future.result()
+                    except Exception as e:
+                        status = f"error: {str(e)[:80]}"
+                        log.error(f"Worker error: {e}")
+                    with lock:
+                        results_agg[status] += 1
+                        done += 1
+                        if done % 5000 == 0:
+                            elapsed = time.time() - start
+                            rate = done / elapsed * 60
+                            remaining = total_estimate - done - skipped_checkpoint - skipped_no_skill
+                            eta = remaining / (done / elapsed) / 60 if done > 0 else 0
+                            log.info(f"  {done:,} done | {rate:.0f}/min | "
+                                     f"ETA ~{eta:.0f}min | {dict(results_agg)}")
+                            checkpoint.flush()
+                    time.sleep(0.02)
+
+        if limit and done >= limit:
+            break
+        if offset is None:
+            break
+
+    checkpoint.flush()
+    return done, skipped_checkpoint, skipped_no_skill, results_agg, start
+
+
+# ── Main ────────────────────────────────────────────────────────────────────
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true",
+                        help="Classify 20 samples without writing anything")
+    parser.add_argument("--workers", type=int, default=16)
+    parser.add_argument("--limit", type=int, default=None)
+    args = parser.parse_args()
+
+    config = get_config()
+    keys = load_gemini_keys()
+    rotator = KeyRotator(keys)
+
+    qdrant = QdrantClient(
+        host=config['vector_db']['host'],
+        port=config['vector_db']['port'],
+        timeout=120
+    )
+    collection = config['vector_db']['collection']
+    checkpoint = Checkpoint(CHECKPOINT_FILE)
+
+    total_vectors = count_collection(qdrant, collection)
+    pre_checkpoint = checkpoint.count()
+
+    log.info(f"Collection has {total_vectors:,} vectors")
+    log.info(f"Checkpoint: {pre_checkpoint:,} already completed")
+    log.info(f"Workers: {args.workers} | Keys: {len(keys)} | Dry run: {args.dry_run}")
+    log.info(f"Estimated Gemini Flash cost: ~${(total_vectors - pre_checkpoint) * 0.0004:.2f}")
+    log.info(f"Streaming in batches of {SCROLL_BATCH:,} (memory-bounded)")
+
+    if args.dry_run:
+        # Scroll one batch, classify 20 diverse samples
+        log.info(f"\nDRY RUN: classifying 20 samples...\n")
+        scroll_results, _ = qdrant.scroll(
+            collection_name=collection,
+            limit=200,
+            with_payload=True,
+            with_vectors=False,
+        )
+        samples = []
+        seen_domains = set()
+        for p in scroll_results:
+            if "skill_level" not in p.payload:
+                continue
+            domains = p.payload.get("domain", [])
+            if isinstance(domains, str):
+                domains = [domains]
+            d_key = tuple(sorted(domains[:2]))
+            if d_key not in seen_domains:
+                samples.append(p)
+                seen_domains.add(d_key)
+            if len(samples) >= 20:
+                break
+
+        for i, p in enumerate(samples, 1):
+            pay = p.payload
+            title = pay.get("title", "(no title)")
+            domains = pay.get("domain", [])
+            old_skill = pay.get("skill_level", "?")
+            subdomains = pay.get("subdomain", [])
+            if isinstance(subdomains, str):
+                subdomains = [subdomains]
+            content = pay.get("content", pay.get("summary", ""))
+
+            key = rotator.next()
+            kt, cx = classify(title, domains, subdomains, content, key)
+
+            print(f"\n--- Sample {i}/{len(samples)} ---")
+            print(f"  Title:          {title}")
+            print(f"  Domain:         {domains}")
+            print(f"  Old skill:      {old_skill}")
+            print(f"  → knowledge_type: {kt}")
+            print(f"  → complexity:     {cx}")
+        est = total_vectors - pre_checkpoint
+        print(f"\nDRY RUN complete. ~{est:,} vectors would be migrated.")
+        print(f"Estimated Gemini Flash cost: ~${est * 0.0004:.2f}")
+        return
+
+    # ── Full migration run (streaming) ──────────────────────────────────────
+    done, skipped_ckpt, skipped_no_skill, results, start = stream_and_process(
+        qdrant, collection, rotator, checkpoint, args.workers, args.limit
+    )
+
+    elapsed = time.time() - start
+    log.info(f"\nComplete in {elapsed/60:.1f}min:")
+    log.info(f"  Processed:           {done:,}")
+    log.info(f"  Skipped (checkpoint): {skipped_ckpt:,}")
+    log.info(f"  Skipped (no skill):   {skipped_no_skill:,}")
+    for status, count in sorted(results.items(), key=lambda x: -x[1]):
+        log.info(f"  {status:<30} {count:>10,}")
+
+
+# Script entry point: run main() only when this file is executed directly.
+if __name__ == "__main__":
+    main()
--- a/scripts/rebuild_qdrant.py
+++ b/scripts/rebuild_qdrant.py
@ -0,0 +1,227 @@
+"""
+RECON Qdrant Rebuilder — patched for headless parallel execution
+
+Deletes and recreates the Qdrant collection, then re-embeds ALL concept JSONs
+from disk using parallel workers. Pass --confirm to skip interactive prompt.
+
+Usage:
+  python3 scripts/rebuild_qdrant.py --confirm [--workers 8]
+"""
+
+import json
+import os
+import sys
+import time
+import argparse
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import defaultdict
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import requests as http_requests
+from qdrant_client import QdrantClient
+from qdrant_client.models import VectorParams, Distance, PointStruct
+
+from lib.utils import get_config, concept_id, setup_logging
+from lib.status import StatusDB
+
+logger = setup_logging('recon.rebuild')
+
+
+def embed_content(config, content):
+    try:
+        tei_url = f"http://{config['embedding']['tei_host']}:{config['embedding']['tei_port']}/embed"
+        resp = http_requests.post(tei_url, json={"inputs": content}, timeout=120)
+        resp.raise_for_status()
+        return resp.json()[0]
+    except Exception as tei_err:
+        logger.debug(f"TEI failed, trying Ollama: {tei_err}")
+
+    ollama_url = f"http://{config['embedding']['ollama_host']}:{config['embedding']['ollama_port']}/api/embed"
+    resp = http_requests.post(ollama_url, json={
+        "model": config['embedding']['model'],
+        "input": content
+    }, timeout=120)
+    resp.raise_for_status()
+    return resp.json()['embeddings'][0]
+
+
+def process_doc(doc_hash, config, db, qdrant, collection):
+    """Embed and upsert all concepts for a single document. Returns (inserted, failed)."""
+    doc_dir = os.path.join(config['paths']['concepts'], doc_hash)
+    doc = db.get_document(doc_hash)
+    filename = doc['filename'] if doc else doc_hash[:8]
+
+    window_files = sorted([
+        f for f in os.listdir(doc_dir)
+        if f.startswith('window_') and f.endswith('.json')
+    ])
+
+    all_concepts = []
+    for wf in window_files:
+        path = os.path.join(doc_dir, wf)
+        try:
+            with open(path, encoding='utf-8') as f:
+                concepts = json.load(f)
+            if isinstance(concepts, list):
+                all_concepts.extend(concepts)
+        except Exception as e:
+            logger.warning(f"Skipping corrupted window {wf} in {doc_hash}: {e}")
+
+    if not all_concepts:
+        return 0, 0
+
+    is_web = doc.get('path', '').startswith(('http://', 'https://')) if doc else False
+
+    # Check meta.json for explicit source_type (e.g. 'transcript')
+    source_type = 'web' if is_web else 'document'
+    text_dir = os.path.join(config['paths']['text'], doc_hash)
+    meta_path = os.path.join(text_dir, 'meta.json')
+    if os.path.exists(meta_path):
+        try:
+            with open(meta_path) as mf:
+                meta = json.load(mf)
+            if meta.get('source_type'):
+                source_type = meta['source_type']
+        except Exception:
+            pass
+
+    points = []
+    failed = 0
+    batch_size = config['processing']['embed_batch_size']
+
+    for idx, concept in enumerate(all_concepts):
+        content = concept.get('content', '')
+        if not content or len(content.strip()) < 10:
+            continue
+        try:
+            vector = embed_content(config, content)
+        except Exception as e:
+            logger.warning(f"Embedding failed {doc_hash}:{idx}: {e}")
+            failed += 1
+            continue
+
+        start_page = concept.get('_start_page', 0)
+        point_id = concept_id(doc_hash, start_page, idx)
+
+        payload = {
+            'doc_hash': doc_hash,
+            'filename': filename,
+            'book_title': doc.get('book_title', '') if doc else '',
+            'book_author': doc.get('book_author', '') if doc else '',
+            'source_type': source_type,
+            'verification_status': 'unverified',
+            'credibility_score': 0.7,
+            'language': 'en',
+        }
+        for field in ['content', 'summary', 'title', 'domain', 'subdomain',
+                      'keywords', 'skill_level', 'key_facts', 'scenario_applicable',
+                      'cross_domain_tags', 'chapter', 'page_ref', 'notes',
+                      '_window', '_start_page']:
+            if field in concept:
+                payload[field] = concept[field]
+
+        points.append(PointStruct(id=point_id, vector=vector, payload=payload))
+
+        if len(points) >= batch_size:
+            qdrant.upsert(collection_name=collection, points=points)
+            points = []
+
+    if points:
+        qdrant.upsert(collection_name=collection, points=points)
+
+    inserted = len(all_concepts) - failed
+    if doc:
+        db.update_status(doc_hash, 'complete', vectors_inserted=inserted)
+
+    return inserted, failed
+
+
+def run_rebuild(workers=8):
+    config = get_config()
+    db = StatusDB()
+
+    qdrant = QdrantClient(
+        host=config['vector_db']['host'],
+        port=config['vector_db']['port'],
+        timeout=60
+    )
+    collection = config['vector_db']['collection']
+
+    # Delete and recreate
+    try:
+        qdrant.delete_collection(collection)
+        logger.info(f"Deleted collection: {collection}")
+    except Exception:
+        pass
+
+    qdrant.create_collection(
+        collection_name=collection,
+        vectors_config=VectorParams(
+            size=config['embedding']['dimensions'],
+            distance=Distance.COSINE
+        )
+    )
+    logger.info(f"Created collection: {collection} ({config['embedding']['dimensions']}d, Cosine)")
+
+    concepts_root = config['paths']['concepts']
+    doc_dirs = sorted([
+        d for d in os.listdir(concepts_root)
+        if os.path.isdir(os.path.join(concepts_root, d))
+    ])
+    logger.info(f"Found {len(doc_dirs)} document concept directories | {workers} workers")
+
+    total_inserted = 0
+    total_failed = 0
+    done = 0
+    lock = threading.Lock()
+    start = time.time()
+
+    with ThreadPoolExecutor(max_workers=workers) as ex:
+        futures = {
+            ex.submit(process_doc, h, config, StatusDB(), qdrant, collection): h
+            for h in doc_dirs
+        }
+        for future in as_completed(futures):
+            doc_hash = futures[future]
+            try:
+                inserted, failed = future.result()
+            except Exception as e:
+                logger.error(f"Worker error {doc_hash}: {e}")
+                inserted, failed = 0, 0
+
+            with lock:
+                total_inserted += inserted
+                total_failed += failed
+                done += 1
+                if done % 500 == 0:
+                    elapsed = time.time() - start
+                    rate = total_inserted / elapsed if elapsed > 0 else 0
+                    remaining = (len(doc_dirs) - done) / (done / elapsed) if elapsed > 0 else 0
+                    logger.info(
+                        f"  [{done}/{len(doc_dirs)}] "
+                        f"{total_inserted:,} vectors | "
+                        f"{rate:.0f}/sec | "
+                        f"ETA {remaining/60:.0f}min"
+                    )
+
+    elapsed = time.time() - start
+    logger.info(f"\nRebuild complete in {elapsed/60:.1f} min: "
+                f"{total_inserted:,} inserted, {total_failed:,} failed")
+
+
+# Script entry point. The rebuild deletes the collection, so unless --confirm
+# is passed the operator must type REBUILD at an interactive prompt.
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--confirm', action='store_true', help='Skip interactive prompt')
+    parser.add_argument('--workers', type=int, default=8)
+    args = parser.parse_args()
+
+    if not args.confirm:
+        print("WARNING: This will DELETE and RECREATE the Qdrant collection.")
+        confirm = input("Type 'REBUILD' to proceed: ")
+        # Any other answer aborts with exit status 0.
+        if confirm != 'REBUILD':
+            print("Aborted.")
+            sys.exit(0)
+
+    run_rebuild(workers=args.workers)
--- a/scripts/reenrich_reference.py
+++ b/scripts/reenrich_reference.py
@ -0,0 +1,314 @@
+#!/usr/bin/env python3
+"""
+reenrich_reference.py — Re-classifies all remaining Reference-tagged concepts.
+
+Scrolls Qdrant for vectors with domain == ["Reference"] or containing "Reference",
+calls Gemini with a hardened prompt that rejects Reference as a valid response,
+updates both Qdrant payload and concept JSON on disk.
+
+Usage:
+  python3 /opt/recon/scripts/reenrich_reference.py [--dry-run] [--workers 16] [--limit N]
+"""
+
+import json
+import time
+import random
+import logging
+import argparse
+import threading
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import defaultdict
+
+import google.generativeai as genai
+from qdrant_client import QdrantClient
+from qdrant_client.models import FieldCondition, MatchAny, Filter
+
+import sys
+sys.path.insert(0, '/opt/recon')
+from lib.utils import get_config, setup_logging
+
+# Log records go to this file and to the console (see the two handlers below).
+LOG_FILE = Path("/opt/recon/logs/reenrich_reference.log")
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
+)
+log = logging.getLogger("reenrich_reference")
+
+# Root of the on-disk concept JSON; update_concept_json() looks for
+# <CONCEPTS_DIR>/<doc_hash>/window_*.json underneath it.
+CONCEPTS_DIR = Path("/opt/recon/data/concepts")
+
+# The only domain labels accepted from the classifier. classify() drops every
+# other value, which is how "Reference" gets filtered out.
+CANONICAL_DOMAINS = {
+    "Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
+    "Foundational Skills", "Communications", "Medical", "Food Systems",
+    "Navigation", "Logistics", "Power Systems", "Leadership",
+    "Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
+}
+
+# Hardened prompt — Reference explicitly forbidden, classification rules detailed.
+# This is a str.format() template: classify() fills in {title}, {subdomain} and
+# {content}. The braces of the JSON example on the last line are doubled so that
+# format() emits them literally.
+CLASSIFY_PROMPT = """\
+You are a knowledge classification engine. Classify this concept into its correct domain.
+
+VALID DOMAINS — use ONLY these exact strings:
+  Defense & Tactics
+  Sustainment Systems
+  Off-Grid Systems
+  Foundational Skills
+  Communications
+  Medical
+  Food Systems
+  Navigation
+  Logistics
+  Power Systems
+  Leadership
+  Scenario Playbooks
+  Water Systems
+  Security
+  Community Coordination
+
+FORBIDDEN: Do NOT output "Reference" under any circumstances. It is not a valid domain.
+FORBIDDEN: Do NOT output an empty domain list.
+
+CLASSIFICATION RULES:
+- First aid, anatomy, pharmacology, herbs, veterinary, austere medicine, wound care → Medical
+- Food growing, foraging, hunting, fishing, animal husbandry, livestock → Sustainment Systems
+- Food preservation, canning, fermentation, food storage, dehydrating → Food Systems
+- Solar, wind, hydro, batteries, generators, inverters, charge controllers → Power Systems
+- Water sourcing, filtration, purification, sanitation, wells, rainwater → Water Systems
+- Radio, antennas, mesh networking, SIGINT, amateur radio → Communications
+- Weapons, tactics, NBC, security operations, field craft → Defense & Tactics
+- Permaculture, soil science, agroforestry, composting → Sustainment Systems
+- Shelter, construction, masonry, blacksmithing, woodworking, crafts → Foundational Skills
+- Navigation, land nav, celestial nav, map reading, compass → Navigation
+- Emergency planning, disaster prep, scenario planning → Scenario Playbooks
+- Leadership, governance, community organization → Leadership
+- Supply chain, transportation, inventory → Logistics
+- Physical security, perimeter, surveillance → Security
+- Community building, cooperation, mutual aid → Community Coordination
+- Biogas, wood gasification, rocket stoves, appropriate technology → Off-Grid Systems
+
+If uncertain between two domains, pick the most actionable one for a self-reliant household.
+
+Concept title: {title}
+Concept subdomain tags: {subdomain}
+Concept content: {content}
+
+Return ONLY valid JSON, no markdown, no explanation:
+{{"domain": ["Domain Name"]}}
+"""
+
+def load_gemini_keys(env_path="/opt/recon/.env"):
+    """Return the Gemini API keys listed in a dotenv-style file.
+
+    Every line of the form ``GEMINI_KEY_<suffix>=<value>`` contributes one key,
+    in file order. Whitespace around the line and the value is ignored, one
+    layer of surrounding quotes is removed, and lines that have no ``=`` or an
+    empty value are skipped instead of raising or yielding a blank key.
+
+    Args:
+        env_path: File to read; defaults to the deployment's ``.env``.
+
+    Returns:
+        List of key strings (possibly empty).
+
+    Raises:
+        OSError: if the file cannot be read.
+    """
+    keys = []
+    for raw_line in Path(env_path).read_text().splitlines():
+        line = raw_line.strip()
+        if not line.startswith("GEMINI_KEY_"):
+            continue
+        _name, sep, value = line.partition("=")
+        value = value.strip().strip("'\"")
+        if sep and value:
+            keys.append(value)
+    return keys
+
+class KeyRotator:
+    """Hands out API keys round-robin; the cursor is guarded by a lock."""
+
+    def __init__(self, keys):
+        # Construction with an empty sequence is allowed (dry runs never ask
+        # for a key); the error is raised lazily in next().
+        self.keys = keys
+        self._i = 0
+        self._lock = threading.Lock()
+
+    def next(self):
+        """Return the next key, wrapping around after the last one.
+
+        Raises:
+            RuntimeError: if the rotator holds no keys (previously this
+                surfaced as an unexplained ZeroDivisionError).
+        """
+        with self._lock:
+            if not self.keys:
+                raise RuntimeError(
+                    "KeyRotator has no API keys to hand out "
+                    "(no GEMINI_KEY_* entries were loaded)"
+                )
+            key = self.keys[self._i % len(self.keys)]
+            self._i = (self._i + 1) % len(self.keys)
+            return key
+
+def classify(title, subdomains, content, key, attempt=0):
+    """Ask Gemini for the concept's domain(s); fall back to keyword matching.
+
+    Args:
+        title: Concept title; "(untitled)" is sent when it is empty.
+        subdomains: Sequence of subdomain tags; at most the first 10 are sent.
+        content: Concept text; only the first 400 characters are sent.
+        key: Gemini API key used for this call.
+        attempt: Unused; kept so existing call sites stay valid.
+
+    Returns:
+        A non-empty list of domain names. Model answers are filtered against
+        CANONICAL_DOMAINS (which drops "Reference"). When no usable answer is
+        obtained the result of subdomain_fallback(subdomains) is returned.
+    """
+    prompt = CLASSIFY_PROMPT.format(
+        title=title or "(untitled)",
+        subdomain=", ".join(subdomains[:10]) if subdomains else "(none)",
+        content=str(content)[:400] if content else "(none)",
+    )
+    # NOTE(review): genai.configure() looks like process-wide configuration;
+    # with several worker threads rotating keys, a call may run with another
+    # thread's key — confirm against the google-generativeai documentation.
+    genai.configure(api_key=key)
+    model = genai.GenerativeModel(
+        "gemini-2.0-flash",
+        generation_config={"response_mime_type": "application/json"}
+    )
+
+    max_calls = 4
+    rejected = 0  # replies that parsed but contained no canonical domain
+    for retry in range(max_calls):
+        try:
+            resp = model.generate_content(prompt)
+            data = json.loads(resp.text)
+        except Exception as e:
+            err = str(e).lower()
+            # NOTE(review): these are plain substring tests, so "rate" also
+            # matches unrelated words such as "generate".
+            if any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
+                if retry < max_calls - 1:  # no point sleeping after the last try
+                    time.sleep(min(5 * (2 ** retry) + random.uniform(0, 3), 60))
+                continue
+            log.warning(f"  Non-transient classify error for {title!r}: {e}")
+            break
+
+        # Accept {"domain": [...]} and {"domain": "X"}; anything else is empty.
+        raw = data.get("domain", []) if isinstance(data, dict) else []
+        if isinstance(raw, str):
+            raw = [raw]
+        elif not isinstance(raw, list):
+            raw = []
+
+        domains = []
+        for d in raw:
+            # Keeping only canonical names strips "Reference" automatically.
+            if isinstance(d, str) and d in CANONICAL_DOMAINS and d not in domains:
+                domains.append(d)
+        if domains:
+            return domains
+
+        # Gemini returned Reference or nothing usable: ask exactly once more,
+        # then stop spending API calls and use the heuristic.
+        rejected += 1
+        if rejected >= 2:
+            break
+
+    # Last resort: subdomain keyword heuristic
+    return subdomain_fallback(subdomains)
+
+# Ordered (keywords, domain) rows used when Gemini gives no usable answer.
+# Keywords are matched as lowercase substrings and the first matching row wins.
+# NOTE(review): substring matching means e.g. "well" also hits "dwelling", and
+# there is no row for "Community Coordination" ("community" maps to Leadership).
+SUBDOMAIN_FALLBACK_MAP = [
+    (["first aid", "trauma", "wound", "anatomy", "pharmacol", "herbal", "medicin", "veterinar", "dental", "surgery"], "Medical"),
+    (["foraging", "hunting", "fishing", "livestock", "permaculture", "soil", "agroforestry", "mycolog", "mushroom"], "Sustainment Systems"),
+    (["canning", "preservation", "fermentation", "food storage", "dehydrat"], "Food Systems"),
+    (["solar", "battery", "generator", "inverter", "wind turbine", "photovoltaic"], "Power Systems"),
+    (["water purif", "filtration", "sanitation", "well", "rainwater"], "Water Systems"),
+    (["radio", "antenna", "mesh", "sigint", "amateur radio", "meshtastic"], "Communications"),
+    (["weapon", "firearm", "tactic", "nbc", "chemical warfare", "ballistic"], "Defense & Tactics"),
+    (["navigation", "compass", "land nav", "celestial"], "Navigation"),
+    (["blacksmith", "woodwork", "masonry", "construct", "craft", "pottery"], "Foundational Skills"),
+    (["biogas", "gasif", "rocket stove", "appropriate tech"], "Off-Grid Systems"),
+    (["disaster", "emergency prep", "evacuation", "scenario"], "Scenario Playbooks"),
+    (["leadership", "governance", "community"], "Leadership"),
+    (["logistics", "supply chain", "transport"], "Logistics"),
+    (["security", "perimeter", "surveillance"], "Security"),
+]
+
+def subdomain_fallback(subdomains):
+    """Pick a domain from subdomain tags by keyword lookup.
+
+    Args:
+        subdomains: Sequence of tag strings. ``None`` is treated as empty, a
+            bare string as a single tag, and non-string items are ignored.
+
+    Returns:
+        A one-element list holding the domain of the first matching row of
+        SUBDOMAIN_FALLBACK_MAP, or ["Foundational Skills"] when nothing matches.
+    """
+    if isinstance(subdomains, str):
+        subdomains = [subdomains]
+    combined = " ".join(
+        s.lower() for s in (subdomains or []) if isinstance(s, str)
+    )
+    for keywords, domain in SUBDOMAIN_FALLBACK_MAP:
+        if any(kw in combined for kw in keywords):
+            return [domain]
+    return ["Foundational Skills"]  # absolute last resort
+
+# Serialises read-modify-write cycles on window files: process_point() runs on
+# many worker threads and several points can live in the same window file.
+_CONCEPT_FILE_LOCK = threading.Lock()
+
+def update_concept_json(doc_hash, title, new_domains):
+    """Rewrite the domain of matching concepts in a document's window files.
+
+    Window files under CONCEPTS_DIR/<doc_hash> are scanned in glob order. In
+    the first file where a concept has the given title and a domain list that
+    contains "Reference" or no canonical domain, every such concept gets
+    ``new_domains`` and the file is rewritten; the scan then stops.
+
+    The file is replaced through a temporary sibling so a failed or concurrent
+    write cannot leave a half-written window file behind.
+
+    Args:
+        doc_hash: Name of the document directory below CONCEPTS_DIR.
+        title: Concept title to match exactly.
+        new_domains: List stored as the concept's new ``domain``.
+
+    Returns:
+        True if a file was rewritten, False otherwise (including when the
+        document directory is missing or every candidate file failed).
+    """
+    doc_dir = CONCEPTS_DIR / doc_hash
+    if not doc_dir.exists():
+        return False
+    for wf in doc_dir.glob("window_*.json"):
+        tmp = wf.with_name(wf.name + ".tmp")
+        with _CONCEPT_FILE_LOCK:
+            try:
+                with open(wf, "r", encoding="utf-8") as f:
+                    concepts = json.load(f)
+                if not isinstance(concepts, list):
+                    continue
+                changed = False
+                for c in concepts:
+                    if not isinstance(c, dict) or c.get("title") != title:
+                        continue
+                    raw = c.get("domain") or []  # tolerate a null domain
+                    if isinstance(raw, str):
+                        raw = [raw]
+                    has_canonical = any(
+                        isinstance(d, str) and d in CANONICAL_DOMAINS for d in raw
+                    )
+                    if "Reference" in raw or not has_canonical:
+                        c["domain"] = new_domains
+                        changed = True
+                if not changed:
+                    continue
+                with open(tmp, "w", encoding="utf-8") as f:
+                    json.dump(concepts, f, indent=2, ensure_ascii=False)
+                tmp.replace(wf)  # atomic swap on the same filesystem
+                return True
+            except Exception as e:
+                # Best effort: report and move on to the next window file.
+                log.warning(f"  Could not update {wf}: {e}")
+                try:
+                    tmp.unlink()
+                except OSError:
+                    pass
+    return False
+
+def process_point(point, qdrant, collection, key_rotator, dry_run):
+    """Re-classify one Qdrant point and persist the new domain.
+
+    Args:
+        point: Scroll result exposing ``id`` and ``payload``.
+        qdrant: Client whose ``set_payload`` is used to store the new domain.
+        collection: Name of the Qdrant collection.
+        key_rotator: Object whose ``next()`` yields a Gemini API key.
+        dry_run: When true nothing is classified or written.
+
+    Returns:
+        "would_classify" for a dry run, otherwise "ok".
+    """
+    # Bail out before taking a key or calling Gemini: a dry run must be free.
+    if dry_run:
+        return "would_classify"
+
+    payload = point.payload or {}
+    title = payload.get("title", "")
+    subdomains = payload.get("subdomain") or []
+    if isinstance(subdomains, str):
+        subdomains = [subdomains]
+    # Fall back to the summary when content is missing or empty.
+    content = payload.get("content") or payload.get("summary", "")
+    doc_hash = payload.get("doc_hash", "")
+
+    key = key_rotator.next()
+    new_domains = classify(title, subdomains, content, key)
+
+    # Update Qdrant payload
+    qdrant.set_payload(
+        collection_name=collection,
+        payload={"domain": new_domains},
+        points=[point.id],
+    )
+
+    # Update JSON on disk
+    if doc_hash:
+        update_concept_json(doc_hash, title, new_domains)
+
+    return "ok"
+
+def main():
+    """Re-classify every Qdrant point whose ``domain`` contains "Reference".
+
+    Flags: ``--dry-run`` only counts the affected points, ``--workers`` sets
+    the thread-pool size and ``--limit`` caps how many points are processed.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--workers", type=int, default=16)
+    parser.add_argument("--limit", type=int, default=None)
+    args = parser.parse_args()
+
+    config = get_config()
+    keys = load_gemini_keys()
+    if not keys and not args.dry_run:
+        log.error("No GEMINI_KEY_* entries found; cannot classify anything.")
+        return
+    rotator = KeyRotator(keys)
+
+    qdrant = QdrantClient(
+        host=config['vector_db']['host'],
+        port=config['vector_db']['port'],
+        timeout=60
+    )
+    collection = config['vector_db']['collection']
+
+    log.info("Scrolling Qdrant for Reference-tagged concepts...")
+
+    # Scroll all points containing Reference in domain
+    limit = args.limit if args.limit and args.limit > 0 else None
+    offset = None
+    reference_points = []
+    while True:
+        page, offset = qdrant.scroll(
+            collection_name=collection,
+            scroll_filter=Filter(
+                must=[FieldCondition(
+                    key="domain",
+                    match=MatchAny(any=["Reference"])
+                )]
+            ),
+            limit=1000,
+            with_payload=True,
+            with_vectors=False,
+            offset=offset,
+        )
+        reference_points.extend(page)
+        if offset is None:
+            break
+        if limit is not None and len(reference_points) >= limit:
+            break
+    # Truncate after the loop so the cap also holds when the final page
+    # (offset is None) is the one that overshoots it.
+    if limit is not None:
+        reference_points = reference_points[:limit]
+
+    total = len(reference_points)
+    log.info(f"Found {total:,} Reference-tagged vectors")
+    log.info(f"Workers: {args.workers} | Keys: {len(keys)} | Dry run: {args.dry_run}")
+    log.info(f"Estimated Gemini Flash cost: ~${total * 0.0004:.2f}")
+
+    if args.dry_run:
+        log.info(f"DRY RUN: would re-classify {total:,} concepts. Exiting.")
+        return
+
+    # Only this thread touches the counters, so no lock is needed here.
+    results = defaultdict(int)
+    done = 0
+    start = time.time()
+
+    with ThreadPoolExecutor(max_workers=args.workers) as ex:
+        futures = {
+            ex.submit(process_point, p, qdrant, collection, rotator, False): p
+            for p in reference_points
+        }
+        for future in as_completed(futures):
+            try:
+                status = future.result()
+            except Exception as e:
+                # One failing point must not abort the whole run.
+                log.error(f"Worker error on point {futures[future].id}: {e}")
+                status = "error"
+            results[status] += 1
+            done += 1
+            if done % 5000 == 0:
+                elapsed = max(time.time() - start, 1e-9)
+                rate = done / elapsed * 60
+                eta = (total - done) / (done / elapsed) / 60
+                log.info(f"  {done:,}/{total:,} | {rate:.0f}/min | ETA {eta:.0f}min | {dict(results)}")
+
+    elapsed = time.time() - start
+    log.info(f"\nComplete in {elapsed/60:.1f}min:")
+    for status, count in sorted(results.items(), key=lambda x: -x[1]):
+        log.info(f"  {status:<20} {count:>10,}")
+
+# Run the re-classification only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/scripts/repair_corrupted.py
+++ b/scripts/repair_corrupted.py
@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+repair_corrupted.py — Repairs window files corrupted by concurrent writes.
+
+Strategy:
+  1. Read corrupted_windows.txt to get the list of bad files
+  2. For each bad file, identify the parent doc hash from the path
+  3. Check if the text directory still exists for that doc
+  4. If yes: re-run Gemini enrichment on just that window
+  5. If no text: mark as unrecoverable
+  6. Report summary
+
+Usage:
+  python3 /opt/recon/scripts/repair_corrupted.py [--dry-run] [--workers 8]
+"""
+
+import json
+import time
+import random
+import logging
+import argparse
+import re
+import threading
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import defaultdict
+
+import google.generativeai as genai
+
+# Input list of damaged window files; main() takes the first tab-separated
+# field of each line as the file path.
+CORRUPTED_LIST = Path("/opt/recon/data/corrupted_windows.txt")
+# Per-document page text (<TEXT_DIR>/<doc_hash>/page_*.txt), read by get_window_text().
+TEXT_DIR = Path("/opt/recon/data/text")
+CONCEPTS_DIR = Path("/opt/recon/data/concepts")
+LOG_FILE = Path("/opt/recon/logs/repair_corrupted.log")
+# Output written by main(): one "<path>\t<reason>" line per file that could not be repaired.
+UNRECOVERABLE_LOG = Path("/opt/recon/data/unrecoverable_windows.txt")
+
+# Log records go to LOG_FILE and to the console.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+    handlers=[
+        logging.FileHandler(LOG_FILE),
+        logging.StreamHandler(),
+    ]
+)
+log = logging.getLogger("repair_corrupted")
+
+# Allowed domain labels. The same names are spelled out inside ENRICH_PROMPT;
+# nothing in the visible part of this script reads this list.
+CANONICAL_DOMAINS = [
+    "Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
+    "Foundational Skills", "Communications", "Medical", "Food Systems",
+    "Navigation", "Logistics", "Power Systems", "Leadership",
+    "Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
+]
+
+# Instruction prefix sent to Gemini; enrich_window_text() appends the raw
+# window text directly after the trailing "Document text:" line.
+ENRICH_PROMPT = """Extract knowledge concepts from this document text.
+
+A concept is a SELF-CONTAINED piece of knowledge that can stand alone.
+
+For each concept, provide ALL fields:
+
+Required:
+- content: Full text of the concept (complete procedure, definition, etc.)
+- summary: 1-2 sentence summary
+- title: Brief descriptive title
+- domain: Array of 1-5 from ONLY these exact strings (no others):
+    Defense & Tactics, Sustainment Systems, Off-Grid Systems, Foundational Skills,
+    Communications, Medical, Food Systems, Navigation, Logistics, Power Systems,
+    Leadership, Scenario Playbooks, Water Systems, Security, Community Coordination
+  CRITICAL: Do NOT use "Reference". Every concept belongs somewhere specific.
+- subdomain: Array of specific subcategories (up to 10)
+- keywords: Array of 3-30 searchable terms
+- skill_level: novice | intermediate | advanced
+- key_facts: Array of specific extractable claims, measurements, data points
+
+Optional (include when present):
+- scenario_applicable: Array from: tuesday_prepper, month_prepper, year_prepper, multi_year, eotwawki
+- cross_domain_tags: Array from: sustainment, medical, security, communications, leadership, logistics, navigation, power_systems, water_systems, food_systems, tactical_ops, community_coordination
+- chapter: Chapter name if identifiable
+- page_ref: Page reference
+
+Return JSON array. If no extractable concepts, return [].
+
+Document text:
+"""
+
+def load_gemini_keys(env_path="/opt/recon/.env"):
+    """Return the Gemini API keys listed in a dotenv-style file.
+
+    Every line of the form ``GEMINI_KEY_<suffix>=<value>`` contributes one key,
+    in file order. Whitespace around the line and the value is ignored, one
+    layer of surrounding quotes is removed, and lines that have no ``=`` or an
+    empty value are skipped instead of raising or yielding a blank key.
+
+    Args:
+        env_path: File to read; defaults to the deployment's ``.env``.
+
+    Returns:
+        List of key strings (possibly empty).
+
+    Raises:
+        OSError: if the file cannot be read.
+    """
+    env = Path(env_path)
+    keys = []
+    for raw_line in env.read_text().splitlines():
+        line = raw_line.strip()
+        if not line.startswith("GEMINI_KEY_"):
+            continue
+        _name, sep, value = line.partition("=")
+        value = value.strip().strip("'\"")
+        if sep and value:
+            keys.append(value)
+    return keys
+
+class KeyRotator:
+    """Hands out API keys round-robin; the cursor is guarded by a lock."""
+
+    def __init__(self, keys):
+        # Construction with an empty sequence is allowed (a dry run never asks
+        # for a key); the error is raised lazily in next().
+        self.keys = keys
+        self._i = 0
+        self._lock = threading.Lock()
+
+    def next(self):
+        """Return the next key, wrapping around after the last one.
+
+        Raises:
+            RuntimeError: if the rotator holds no keys (previously this
+                surfaced as an unexplained ZeroDivisionError).
+        """
+        with self._lock:
+            if not self.keys:
+                raise RuntimeError(
+                    "KeyRotator has no API keys to hand out "
+                    "(no GEMINI_KEY_* entries were loaded)"
+                )
+            key = self.keys[self._i % len(self.keys)]
+            self._i = (self._i + 1) % len(self.keys)
+            return key
+
+def repair_json_truncated(text):
+    """Best-effort recovery of JSON from a damaged or truncated model reply.
+
+    Steps, stopping at the first one that parses:
+      1. drop ASCII control characters (tab, LF and CR are kept) and parse;
+      2. remove trailing commas before ``}`` / ``]`` and parse;
+      3. cut the text after the last ``}`` that closes a top-level object and
+         append as many ``]`` as were open outside string literals at that
+         point, then parse.
+
+    Step 2 runs only after step 1 failed, so text that is already valid is
+    never rewritten, and step 3 ignores brackets that occur inside strings.
+
+    Args:
+        text: Raw reply text.
+
+    Returns:
+        The decoded JSON value, or None when nothing could be salvaged or
+        ``text`` is not a string.
+    """
+    if not isinstance(text, str):
+        return None
+
+    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
+    try:
+        return json.loads(text)
+    except (ValueError, RecursionError):
+        pass
+
+    text = re.sub(r',\s*([}\]])', r'\1', text)
+    try:
+        return json.loads(text)
+    except (ValueError, RecursionError):
+        pass
+
+    # Find the last complete top-level object, tracking string/escape state so
+    # braces and brackets inside string literals are not counted.
+    last_close = -1
+    brackets_at_close = 0
+    brace_depth = 0
+    bracket_depth = 0
+    in_str = False
+    esc = False
+    for i, ch in enumerate(text):
+        if esc:
+            esc = False
+            continue
+        if in_str:
+            if ch == '\\':
+                esc = True
+            elif ch == '"':
+                in_str = False
+            continue
+        if ch == '"':
+            in_str = True
+        elif ch == '{':
+            brace_depth += 1
+        elif ch == '}':
+            brace_depth -= 1
+            if brace_depth == 0:
+                last_close = i
+                brackets_at_close = bracket_depth
+        elif ch == '[':
+            bracket_depth += 1
+        elif ch == ']':
+            bracket_depth -= 1
+
+    if last_close < 0:
+        return None
+    # A negative count yields an empty suffix, i.e. nothing is appended.
+    candidate = text[:last_close + 1] + ']' * brackets_at_close
+    try:
+        return json.loads(candidate)
+    except (ValueError, RecursionError):
+        return None
+
+def enrich_window_text(text, key):
+    """Send window text to Gemini and return the extracted concepts.
+
+    Args:
+        text: Window text appended to ENRICH_PROMPT.
+        key: Gemini API key used for this call.
+
+    Returns:
+        A list of concept dicts (empty when the model reports no concepts), or
+        None when no parseable reply was obtained within four attempts or a
+        non-transient error occurred. A reply that cannot be parsed even after
+        repair_json_truncated() counts as a failed attempt; it is not turned
+        into an empty list, which the caller would persist as a successful
+        repair with zero concepts.
+    """
+    # NOTE(review): genai.configure() looks like process-wide configuration;
+    # with rotating keys on several threads a call may use another thread's
+    # key — confirm against the google-generativeai documentation.
+    genai.configure(api_key=key)
+    model = genai.GenerativeModel(
+        "gemini-2.0-flash",
+        generation_config={"response_mime_type": "application/json"}
+    )
+    max_attempts = 4
+    for attempt in range(max_attempts):
+        try:
+            resp = model.generate_content(ENRICH_PROMPT + text)
+            raw = resp.text
+        except Exception as e:
+            err = str(e).lower()
+            # NOTE(review): plain substring tests; "rate" also matches
+            # unrelated words such as "generate".
+            if any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
+                if attempt < max_attempts - 1:  # no sleep after the last try
+                    delay = min(5 * (2 ** attempt) + random.uniform(0, 3), 60)
+                    time.sleep(delay)
+                continue
+            log.warning(f"  Non-transient error: {e}")
+            break
+
+        try:
+            result = json.loads(raw)
+        except Exception:
+            try:
+                result = repair_json_truncated(raw)
+            except Exception:
+                result = None
+            if result is None:
+                log.warning(f"  Unparseable Gemini response (attempt {attempt + 1}/{max_attempts})")
+                continue
+
+        if isinstance(result, list):
+            return [c for c in result if isinstance(c, dict)]
+        if isinstance(result, dict):
+            return [result]
+        return []  # valid JSON of another shape: nothing to extract
+    return None  # failed
+
+def get_window_text(doc_hash, window_filename, window_size=5, text_dir=None):
+    """Rebuild the text of one window from the document's page files.
+
+    The 1-based window number is parsed from ``window_NNNN.json``. The
+    document's ``page_*.txt`` files are sorted by path and the slice
+    ``[(N-1)*window_size : N*window_size]`` is joined, each page prefixed
+    with a ``--- Page <n> ---`` header.
+
+    Args:
+        doc_hash: Name of the document directory below the text root.
+        window_filename: File name (or path) of the window JSON.
+        window_size: Pages per window. NOTE(review): the default of 5 is taken
+            from the original inline comment ("window_size=5 from config");
+            confirm it matches the enrichment pipeline.
+        text_dir: Root directory of the page text; defaults to TEXT_DIR.
+
+    Returns:
+        The joined text, or None when the name cannot be parsed, the window
+        number is below 1, the directory or its pages are missing, or none of
+        the pages could be read.
+    """
+    try:
+        w_idx = int(Path(window_filename).stem.split('_')[1]) - 1
+    except (IndexError, ValueError):
+        return None
+    # Reject window 0 / negative numbers explicitly instead of relying on a
+    # negative slice happening to come back empty.
+    if w_idx < 0 or window_size < 1:
+        return None
+
+    root = TEXT_DIR if text_dir is None else Path(text_dir)
+    text_path = root / doc_hash
+    if not text_path.is_dir():
+        return None
+
+    page_files = sorted(
+        f for f in text_path.iterdir()
+        if f.name.startswith('page_') and f.name.endswith('.txt')
+    )
+    if not page_files:
+        return None
+
+    start = w_idx * window_size
+    window_pages = page_files[start:start + window_size]
+    if not window_pages:
+        return None
+
+    parts = []
+    for j, pf in enumerate(window_pages):
+        try:
+            text = pf.read_text(encoding='utf-8')
+        except Exception as e:
+            # Keep going with the remaining pages, but leave a trace.
+            log.warning(f"  Could not read {pf}: {e}")
+            continue
+        parts.append(f"--- Page {start + j + 1} ---\n{text}")
+    return "\n\n".join(parts) if parts else None
+
+def repair_file(corrupted_path, key_rotator, dry_run):
+    """Attempt to repair a single corrupted window file.
+
+    Args:
+        corrupted_path: Path of the window file, expected to look like
+            ``.../concepts/{hash}/window_NNNN.json``.
+        key_rotator: Object whose ``next()`` yields a Gemini API key.
+        dry_run: When true, stop before calling Gemini or writing anything.
+
+    Returns:
+        One of "already_valid", "no_source_text", "would_repair",
+        "enrichment_failed", "repaired" or "write_error".
+    """
+    path = Path(corrupted_path)
+
+    # Sanity check -- the file may already parse (e.g. repaired by an earlier run).
+    try:
+        with open(path, encoding='utf-8') as f:
+            json.load(f)
+        return "already_valid"
+    except Exception:
+        pass
+
+    # The parent directory name is the doc hash, the file name the window.
+    doc_hash = path.parent.name
+    window_filename = path.name
+
+    # Get source text for this window
+    window_text = get_window_text(doc_hash, window_filename)
+    if not window_text:
+        return "no_source_text"
+
+    if dry_run:
+        return "would_repair"
+
+    # Re-enrich from source text
+    key = key_rotator.next()
+    concepts = enrich_window_text(window_text, key)
+    if concepts is None:
+        return "enrichment_failed"
+
+    # Tag concepts with metadata
+    try:
+        w_idx = int(path.stem.split('_')[1]) - 1
+        window_size = 5
+        start_page = w_idx * window_size + 1
+    except Exception:
+        w_idx = 0
+        start_page = 0
+
+    for c in concepts:
+        c['_window'] = w_idx + 1
+        c['_start_page'] = start_page
+        c['_doc_hash'] = doc_hash
+        c['_repaired'] = True
+
+    # Write to a temporary sibling and swap it in, so a failed write cannot
+    # leave yet another half-written window file behind.
+    tmp = path.with_name(path.name + ".tmp")
+    try:
+        with open(tmp, 'w', encoding='utf-8') as f:
+            json.dump(concepts, f, indent=2, ensure_ascii=False)
+        tmp.replace(path)
+        return "repaired"
+    except Exception as e:
+        log.warning(f"  Could not write {path}: {e}")
+        try:
+            tmp.unlink()
+        except OSError:
+            pass
+        return "write_error"
+
+def main():
+    """Repair every window file listed in CORRUPTED_LIST and log a summary.
+
+    Flags: ``--dry-run`` reports what would be done without calling Gemini or
+    writing window files; ``--workers`` sets the thread-pool size. Files that
+    could not be repaired are written to UNRECOVERABLE_LOG.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--workers", type=int, default=8)
+    args = parser.parse_args()
+
+    if not CORRUPTED_LIST.exists():
+        log.error(f"Corrupted list not found: {CORRUPTED_LIST}")
+        log.error("Run Task 1 first to generate it.")
+        return
+
+    keys = load_gemini_keys()
+    if not keys and not args.dry_run:
+        log.error("No GEMINI_KEY_* entries found; cannot re-enrich anything.")
+        return
+    rotator = KeyRotator(keys)
+
+    # First tab-separated field of each line is the path. Blank lines are
+    # skipped and duplicates dropped so no file is repaired by two workers.
+    corrupted = []
+    seen = set()
+    with open(CORRUPTED_LIST) as f:
+        for line in f:
+            entry = line.strip().split('\t')[0]
+            if entry and entry not in seen:
+                seen.add(entry)
+                corrupted.append(entry)
+
+    log.info(f"Repairing {len(corrupted):,} corrupted window files")
+    log.info(f"Dry run: {args.dry_run} | Workers: {args.workers} | Keys: {len(keys)}")
+
+    # Only this thread touches the counters, so no lock is needed here.
+    results = defaultdict(int)
+    unrecoverable = []
+
+    with ThreadPoolExecutor(max_workers=args.workers) as ex:
+        futures = {ex.submit(repair_file, p, rotator, args.dry_run): p for p in corrupted}
+        done = 0
+        for future in as_completed(futures):
+            path = futures[future]
+            try:
+                status = future.result()
+            except Exception as e:
+                # One failing file must not abort the whole run.
+                log.error(f"Worker error on {path}: {e}")
+                status = "error"
+            results[status] += 1
+            if status in ("no_source_text", "enrichment_failed", "write_error", "error"):
+                unrecoverable.append((path, status))
+            done += 1
+            if done % 100 == 0:
+                log.info(f"  {done:,}/{len(corrupted):,} | {dict(results)}")
+
+    log.info("── Results ─────────────────────────────────────────────────")
+    for status, count in sorted(results.items(), key=lambda x: -x[1]):
+        log.info(f"  {status:<25} {count:>8,}")
+
+    if unrecoverable:
+        with open(UNRECOVERABLE_LOG, 'w') as f:
+            for path, reason in unrecoverable:
+                f.write(f"{path}\t{reason}\n")
+        log.info(f"\n  Unrecoverable: {len(unrecoverable)} — logged to {UNRECOVERABLE_LOG}")
+    elif args.dry_run:
+        # Nothing was written, so do not claim that files were repaired.
+        log.info("\n  Dry run: no files were modified.")
+    else:
+        log.info("\n  All files repaired successfully.")
+
+# Run the repair only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/scripts/validate.py
+++ b/scripts/validate.py
@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+"""
+RECON Pipeline Validator
+
+Checks pipeline consistency: paths, DB state, file integrity, and service connectivity.
+Validates TEI, Ollama, and Qdrant are reachable. Deep mode checks every document on disk.
+
+Usage: python3 scripts/validate.py [--deep]
+"""
+
+import json
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from lib.utils import get_config, setup_logging
+from lib.status import StatusDB
+
+# Module logger; run_validation() below reports through print() instead.
+logger = setup_logging('recon.validate')
+
+
+def run_validation(deep=False):
+    """Run the pipeline consistency checks and print a report.
+
+    Checks the configured paths, the library root, the Gemini key list, the
+    status counts from StatusDB, and the reachability of TEI, Ollama and
+    Qdrant. Problems are collected as issues (which fail the validation) or
+    warnings (which do not).
+
+    Args:
+        deep: When true, also check the text and concept files of every
+            document row and report directories no row refers to.
+
+    Returns:
+        True when no issues were recorded, False otherwise.
+    """
+    config = get_config()
+    db = StatusDB()
+
+    issues = []
+    warnings = []
+
+    print("=== RECON Validation ===\n")
+
+    # Check paths: a missing database is an issue, any other path a warning.
+    for name, path in config['paths'].items():
+        if os.path.exists(path):
+            continue
+        if name == 'db':
+            issues.append(f"Database not found: {path}")
+        else:
+            warnings.append(f"Directory missing: {name} = {path}")
+
+    # Check library
+    if not os.path.exists(config['library_root']):
+        issues.append(f"Library root not found: {config['library_root']}")
+
+    # Check Gemini keys
+    keys = config.get('gemini_keys', [])
+    if not keys:
+        warnings.append("No Gemini API keys configured in .env")
+    else:
+        print(f"  Gemini keys: {len(keys)} configured")
+
+    # DB status counts
+    counts = db.get_status_counts()
+    cat = counts.get('catalogue', {})
+    doc = counts.get('documents', {})
+
+    print(f"  Catalogue: {sum(cat.values())} entries")
+    print(f"  Documents: {sum(doc.values())} entries")
+    print(f"  Complete: {doc.get('complete', 0)}")
+    print(f"  Failed: {doc.get('failed', 0)}")
+
+    if deep:
+        print("\n--- Deep Validation ---\n")
+
+        # Check every document in pipeline has corresponding files
+        all_docs = db.get_all_documents()
+        text_dir = config['paths']['text']
+        concepts_dir = config['paths']['concepts']
+
+        for d in all_docs:
+            h = d['hash']
+            status = d['status']
+
+            if status in ('extracted', 'enriched', 'complete'):
+                doc_text_dir = os.path.join(text_dir, h)
+                if not os.path.exists(doc_text_dir):
+                    issues.append(f"[{h[:8]}] {d['filename']}: text dir missing but status={status}")
+                elif not any(f.startswith('page_') for f in os.listdir(doc_text_dir)):
+                    issues.append(f"[{h[:8]}] {d['filename']}: no page files in text dir")
+
+            if status in ('enriched', 'complete'):
+                doc_concepts_dir = os.path.join(concepts_dir, h)
+                if not os.path.exists(doc_concepts_dir):
+                    issues.append(f"[{h[:8]}] {d['filename']}: concepts dir missing but status={status}")
+                    continue
+                windows = [f for f in os.listdir(doc_concepts_dir) if f.startswith('window_')]
+                if not windows:
+                    issues.append(f"[{h[:8]}] {d['filename']}: no window files in concepts dir")
+                for wf in windows:
+                    try:
+                        with open(os.path.join(doc_concepts_dir, wf), encoding='utf-8') as f:
+                            data = json.load(f)
+                    except ValueError:
+                        # Covers malformed JSON and undecodable bytes alike.
+                        issues.append(f"[{h[:8]}] {wf}: invalid JSON")
+                    except OSError as e:
+                        issues.append(f"[{h[:8]}] {wf}: unreadable ({e})")
+                    else:
+                        if not isinstance(data, list):
+                            issues.append(f"[{h[:8]}] {wf}: not a JSON array")
+
+        # Check orphaned directories. The hash set is built up front so the
+        # concepts check also works when the text directory does not exist.
+        doc_hashes = {d['hash'] for d in all_docs}
+
+        if os.path.exists(text_dir):
+            for dirname in os.listdir(text_dir):
+                if dirname not in doc_hashes:
+                    warnings.append(f"Orphaned text dir: {dirname}")
+
+        if os.path.exists(concepts_dir):
+            for dirname in os.listdir(concepts_dir):
+                if dirname not in doc_hashes:
+                    warnings.append(f"Orphaned concepts dir: {dirname}")
+
+        print(f"  Checked {len(all_docs)} documents")
+
+    # Connectivity checks
+    print("\n--- Connectivity ---\n")
+
+    import requests as http_requests
+
+    # Check TEI (primary embedding backend)
+    try:
+        tei_url = f"http://{config['embedding']['tei_host']}:{config['embedding']['tei_port']}/info"
+        resp = http_requests.get(tei_url, timeout=10)
+        if resp.status_code == 200:
+            print(f"  TEI: OK (bge-m3 at {config['embedding']['tei_host']}:{config['embedding']['tei_port']})")
+        else:
+            issues.append(f"TEI: HTTP {resp.status_code}")
+    except Exception as e:
+        issues.append(f"TEI: unreachable ({e})")
+
+    # Check Ollama (fallback): failures here are warnings only.
+    try:
+        ollama_url = f"http://{config['embedding']['ollama_host']}:{config['embedding']['ollama_port']}/api/tags"
+        resp = http_requests.get(ollama_url, timeout=10)
+        if resp.status_code == 200:
+            print(f"  Ollama: OK (fallback at {config['embedding']['ollama_host']}:{config['embedding']['ollama_port']})")
+        else:
+            warnings.append(f"Ollama: HTTP {resp.status_code}")
+    except Exception as e:
+        warnings.append(f"Ollama: unreachable ({e}) — fallback only, not critical")
+
+    # Check Qdrant and the configured collection
+    try:
+        from qdrant_client import QdrantClient
+        qdrant = QdrantClient(
+            host=config['vector_db']['host'],
+            port=config['vector_db']['port'],
+            timeout=10
+        )
+        collections = [c.name for c in qdrant.get_collections().collections]
+        target = config['vector_db']['collection']
+        if target in collections:
+            info = qdrant.get_collection(target)
+            print(f"  Qdrant: OK ({target}: {info.points_count} points)")
+        else:
+            issues.append(f"Qdrant: collection {target} not found")
+    except Exception as e:
+        issues.append(f"Qdrant: unreachable ({e})")
+
+    # Summary
+    print("\n--- Summary ---\n")
+
+    if warnings:
+        print(f"Warnings ({len(warnings)}):")
+        for w in warnings:
+            print(f"  ⚠ {w}")
+
+    if issues:
+        print(f"\nIssues ({len(issues)}):")
+        for issue in issues:
+            print(f"  ✗ {issue}")
+        print(f"\nValidation FAILED: {len(issues)} issue(s)")
+    else:
+        print("Validation PASSED")
+
+    return not issues
+
+
+if __name__ == '__main__':
+    # Deep mode is switched on by a literal --deep anywhere on the command line.
+    wants_deep = '--deep' in sys.argv
+    run_validation(deep=wants_deep)