Initial commit: RECON codebase baseline

Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete).
Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-14 14:57:23 +00:00
commit 563c16bb71
59 changed files with 18327 additions and 0 deletions

0
scripts/__init__.py Normal file
View file

373
scripts/aa_download.py Executable file
View file

@ -0,0 +1,373 @@
#!/usr/bin/env python3
"""
aa_download.py - Anna's Archive bulk downloader for RECON library acquisition.
For each target book:
1. Searches Anna's Archive (the BASE_AA mirror) for the title + author
2. Picks the best PDF candidate (first result whose text mentions the author's surname, otherwise the first result)
3. Takes the MD5 from the search-result link and reads the page count from the book page
4. Attempts download from Libgen mirrors in order
5. Verifies the downloaded file starts with the %PDF signature
6. Writes the full acquisition report once all books have been processed
Usage:
python3 /opt/recon/scripts/aa_download.py [--dry-run] [--limit N]
Report output: ~/projects/recon/aa_acquisition_report.md
"""
import json
import time
import random
import hashlib
import logging
import argparse
from pathlib import Path
from datetime import datetime
import requests
from bs4 import BeautifulSoup
# Markdown report written by write_report() once all targets are processed.
REPORT_PATH = Path.home() / "projects/recon/aa_acquisition_report.md"
LOG_FILE = Path("/opt/recon/logs/aa_download.log")

# logging.FileHandler opens its file immediately and raises FileNotFoundError
# when the directory is missing, so make sure it exists before configuring.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
)
log = logging.getLogger("aa_download")

# One shared session so headers (and any cookies) are reused across requests.
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Accept-Language": "en-US,en;q=0.9",
})

# Base URL used for both the search page and the /md5/<hash> detail pages.
BASE_AA = "https://annas-archive.gl"

# Download attempt order — try fastest mirrors first
# Each template is filled with the book's MD5 via str.format(md5=...).
LIBGEN_MIRRORS = [
    "https://libgen.is/get.php?md5={md5}",
    "https://libgen.rs/get.php?md5={md5}",
    "https://libgen.st/get.php?md5={md5}",
    "https://libgen.li/ads.php?md5={md5}",
]
# ── Target book list ──────────────────────────────────────────────────────────
# Each entry is a (title, author, dest_dir) tuple:
#   title    - part of the search query and the basis of the output filename
#   author   - part of the search query; its last whitespace-separated word is
#              matched against search results and appended to the filename
#   dest_dir - sub-directory of BASE_LIB the PDF is saved under
TARGETS = [
    # (title, author, dest_dir)
    # Medical — Herbalism
    ("Medical Herbalism", "David Hoffmann", "Medical/Herbalism"),
    ("Making Plant Medicine", "Richo Cech", "Medical/Herbalism"),
    ("The Earthwise Herbal Volume 1", "Matthew Wood", "Medical/Herbalism"),
    ("The Earthwise Herbal Volume 2", "Matthew Wood", "Medical/Herbalism"),
    ("Herbal Antibiotics", "Stephen Buhner", "Medical/Herbalism"),
    ("Herbal Antivirals", "Stephen Buhner", "Medical/Herbalism"),
    ("The Herbal Medicine-Maker's Handbook", "James Green", "Medical/Herbalism"),
    ("Rosemary Gladstar's Medicinal Herbs", "Rosemary Gladstar", "Medical/Herbalism"),
    # Medical — Austere
    ("Wilderness Medicine", "Paul Auerbach", "Medical/Austere"),
    ("Medicine for Mountaineering", "James Wilkerson", "Medical/Austere"),
    # Medical — Veterinary
    ("The Chicken Health Handbook", "Gail Damerow", "Medical/Veterinary"),
    ("Goat Husbandry", "David Mackenzie", "Medical/Veterinary"),
    # Power Systems
    ("The Renewable Energy Handbook", "William Kemp", "Power"),
    ("Homebrew Wind Power", "Dan Bartmann", "Power"),
    ("Wind Energy Basics", "Paul Gipe", "Power"),
    ("12-Volt Bible", "Brotherton", "Power"),
    ("Wiring a House", "Rex Cauldwell", "Power"),
    # Navigation
    ("Wilderness Navigation", "Bob Burns", "Navigation"),
    ("Be Expert with Map and Compass", "Bjorn Kjellstrom", "Navigation"),
    ("Emergency Navigation", "David Burch", "Navigation"),
    ("The Natural Navigator", "Tristan Gooley", "Navigation"),
    ("The Essential Wilderness Navigator", "David Seidman", "Navigation"),
    # Water Systems
    ("Rainwater Harvesting for Drylands Volume 1", "Brad Lancaster", "Water"),
    ("Rainwater Harvesting for Drylands Volume 2", "Brad Lancaster", "Water"),
    ("Rainwater Harvesting for Drylands Volume 3", "Brad Lancaster", "Water"),
    ("Water Storage", "Art Ludwig", "Water"),
    ("The Home Water Supply", "Stu Campbell", "Water"),
    # Food Systems
    ("The Art of Fermentation", "Sandor Katz", "Food"),
    ("Fermented Vegetables", "Kirsten Shockey", "Food"),
    ("Mastering Artisan Cheesemaking", "Gianaclis Caldwell", "Food"),
    ("Home Cheese Making", "Ricki Carroll", "Food"),
    ("The Art of Natural Cheesemaking", "David Asher", "Food"),
    # Permaculture
    ("Edible Forest Gardens Volume 1", "Dave Jacke", "Permaculture"),
    ("Edible Forest Gardens Volume 2", "Dave Jacke", "Permaculture"),
    ("Creating a Forest Garden", "Martin Crawford", "Permaculture"),
    ("Sepp Holzer's Permaculture", "Sepp Holzer", "Permaculture"),
    ("The Permaculture Handbook", "Peter Bane", "Permaculture"),
    ("The Market Gardener", "Jean-Martin Fortier", "Permaculture"),
    # Scenario / Emergency
    ("SAS Survival Handbook", "John Wiseman", "Scenario"),
    ("Pocket Ref", "Thomas Glover", "Scenario"),
    ("Deep Survival", "Laurence Gonzales", "Scenario"),
    # Foundational Skills
    ("Back to Basics", "Reader's Digest", "Skills"),
    ("A Pattern Language", "Christopher Alexander", "Skills"),
]
# Root directory that every dest_dir above is resolved against.
BASE_LIB = Path("/mnt/library/Acquired")
def search_aa(title, author):
    """Query the Anna's Archive search page and collect up to five candidates.

    The search is restricted to English PDFs. Each candidate is a dict with
    the keys ``md5`` (32-character hash taken from the link), ``text`` (the
    link's visible text) and ``href``. Links without text and repeated MD5s
    are skipped. Returns an empty list when the request fails.
    """
    try:
        response = SESSION.get(
            f"{BASE_AA}/search",
            params={"q": f"{title} {author}", "ext": "pdf", "lang": "en"},
            timeout=20,
        )
        response.raise_for_status()
    except Exception as e:
        log.warning(f"Search failed for '{title}': {e}")
        return []

    # Keyed by MD5: the dict doubles as the "already seen" set and keeps
    # candidates in page order.
    found = {}
    page = BeautifulSoup(response.text, "html.parser")
    for anchor in page.select("a[href^='/md5/']"):
        href = anchor.get("href", "")
        tail = href.split("/md5/")[-1]
        md5 = tail.split("/")[0].split("?")[0].strip()
        if len(md5) != 32:
            continue
        label = anchor.get_text(" ", strip=True)
        if not label or md5 in found:
            continue
        found[md5] = {"md5": md5, "text": label, "href": href}
        if len(found) >= 5:
            break
    return list(found.values())
def get_book_details(md5):
    """Fetch the book detail page for *md5* and extract lightweight metadata.

    Returns ``{"pages": int | None, "text": <first 500 chars of page text>}``,
    or an empty dict when the page cannot be fetched or parsed.

    The page count is a heuristic. An explicit "<N> pages" label is preferred
    when the page text contains one; otherwise the first standalone number
    between 50 and 5000 is used, which may be a year or another figure.
    """
    import re  # local import: only this function needs it

    url = f"{BASE_AA}/md5/{md5}"
    try:
        r = SESSION.get(url, timeout=20)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        text = soup.get_text(" ", strip=True)

        pages = None
        labelled = re.search(r"\b(\d{2,4})\s*pages\b", text, re.IGNORECASE)
        if labelled and 50 < int(labelled.group(1)) < 5000:
            pages = int(labelled.group(1))
        else:
            for word in text.split():
                # isdecimal() (unlike isdigit()) only accepts characters that
                # int() can actually parse, so e.g. a superscript "²" cannot
                # raise here and discard the whole result.
                if word.isdecimal() and 50 < int(word) < 5000:
                    pages = int(word)
                    break
        return {"pages": pages, "text": text[:500]}
    except Exception as e:
        log.warning(f"Detail fetch failed for md5={md5}: {e}")
        return {}
def try_download(md5, dest_path):
    """Try each mirror in LIBGEN_MIRRORS until one yields a PDF.

    Args:
        md5: MD5 hash substituted into each mirror URL template.
        dest_path: ``pathlib.Path`` the verified PDF is saved to.

    Returns:
        True once a file starting with ``%PDF`` has been stored at
        *dest_path*, False when every mirror failed.

    Data is streamed to a sibling ``<name>.part`` file and only renamed onto
    *dest_path* after the signature check. A failed or interrupted transfer
    therefore never leaves a partial file that a later run would mistake for
    a finished download.
    """
    from urllib.parse import urljoin  # local import: only needed here

    part_path = dest_path.with_name(dest_path.name + ".part")
    for mirror_tpl in LIBGEN_MIRRORS:
        url = mirror_tpl.format(md5=md5)
        r = None
        try:
            r = SESSION.get(url, timeout=60, stream=True, allow_redirects=True)
            if r.status_code != 200:
                continue
            content_type = r.headers.get("content-type", "")
            # Some mirrors return an HTML ads page before the real file
            if "text/html" in content_type:
                soup = BeautifulSoup(r.text, "html.parser")
                dl_link = (soup.select_one("a[href*='.pdf']")
                           or soup.select_one("a[href*='get.php']"))
                if not dl_link:
                    continue
                # Resolve against the page that served the link, so relative
                # hrefs such as "get.php?md5=..." work on every mirror.
                actual_url = urljoin(r.url, dl_link["href"])
                r.close()
                r = SESSION.get(actual_url, timeout=120, stream=True)
                if r.status_code != 200:
                    continue
            # Stream to disk
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            with open(part_path, "wb") as f:
                for chunk in r.iter_content(8192):
                    f.write(chunk)
            # Verify it's a real PDF
            with open(part_path, "rb") as f:
                header = f.read(4)
            if header == b"%PDF":
                part_path.replace(dest_path)
                size_mb = dest_path.stat().st_size / 1024 / 1024
                log.info(f" [OK] {dest_path.name} ({size_mb:.1f}MB) via {url}")
                return True
            log.warning(f" [BAD] Not a PDF from {url}")
        except Exception as e:
            log.warning(f" Mirror failed {url}: {e}")
        finally:
            # Release the streamed connection and drop any leftover partial
            # data (a no-op after a successful rename).
            if r is not None:
                r.close()
            part_path.unlink(missing_ok=True)
    return False
def process_book(title, author, subdir, dry_run):
    """Run the search + download pipeline for one book.

    Args:
        title: Book title; used in the query and for the output filename.
        author: Author name; its last word is matched against result text
            and appended to the filename.
        subdir: Destination directory relative to BASE_LIB.
        dry_run: When true, search only and report the match; no download.

    Returns:
        A result dict with the keys ``title``, ``author``, ``status``,
        ``md5``, ``pages``, ``file`` and ``notes``. ``status`` is one of
        "NOT FOUND", "ALREADY EXISTS", "DRY RUN — found", "DOWNLOADED" or
        "MD5 ONLY".
    """
    log.info(f"[SEARCH] '{title}' — {author}")
    result = {
        "title": title,
        "author": author,
        "status": "NOT FOUND",
        "md5": "",
        "pages": "",
        "file": "",
        "notes": "",
    }

    # The destination depends only on title/author, so it is resolved before
    # any network traffic: a book that is already present costs no requests.
    safe_title = "".join(c if c.isalnum() or c in " ._-" else "_" for c in title)[:60]
    safe_author = author.split()[-1]
    dest = BASE_LIB / subdir / f"{safe_title}_{safe_author}.pdf"
    if not dry_run and dest.exists():
        result["status"] = "ALREADY EXISTS"
        result["file"] = str(dest)
        return result

    candidates = search_aa(title, author)
    if not candidates:
        result["notes"] = "No results from AA search"
        return result

    # Pick best candidate — prefer one whose text contains author name
    surname = safe_author.lower()
    best = next((c for c in candidates if surname in c["text"].lower()), None)
    if best is None:
        best = candidates[0]  # take first result if no author match
        log.warning(f" No result mentions '{safe_author}'; falling back to first hit")

    md5 = best["md5"]
    result["md5"] = md5
    details = get_book_details(md5)
    # "pages" may be present but None; keep the report cell empty then.
    result["pages"] = details.get("pages") or ""

    if dry_run:
        result["status"] = "DRY RUN — found"
        result["notes"] = f"MD5: {md5}"
        return result

    log.info(f" MD5: {md5} — attempting download...")
    if try_download(md5, dest):
        result["status"] = "DOWNLOADED"
        result["file"] = str(dest)
    else:
        result["status"] = "MD5 ONLY"
        result["notes"] = f"All mirrors failed. MD5: {md5}"
    return result
def write_report(results):
    """Write the Markdown acquisition report to REPORT_PATH.

    *results* is a list of the dicts returned by process_book(). The report
    holds a per-status count table followed by one section per non-empty
    status group. Dry-run matches get their own section (and a count row)
    so a ``--dry-run`` report is not empty. The file is written as UTF-8
    because the text contains non-ASCII characters.
    """
    REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)

    def with_status(status):
        return [r for r in results if r["status"] == status]

    def cell(value):
        # A missing page count is stored as None or ""; show it as empty.
        return "" if value is None else value

    downloaded = with_status("DOWNLOADED")
    md5_only = with_status("MD5 ONLY")
    not_found = with_status("NOT FOUND")
    already_have = with_status("ALREADY EXISTS")
    dry_run_found = with_status("DRY RUN — found")

    lines = [
        "# Anna's Archive Acquisition Report",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"**Total searched:** {len(results)}",
        "",
        "| Status | Count |",
        "|--------|-------|",
        f"| Downloaded | {len(downloaded)} |",
        f"| MD5 only (mirrors failed) | {len(md5_only)} |",
        f"| Not found on AA | {len(not_found)} |",
        f"| Already in library | {len(already_have)} |",
    ]
    if dry_run_found:
        lines.append(f"| Dry run (found, not downloaded) | {len(dry_run_found)} |")
    lines.append("")

    if downloaded:
        lines += ["## Downloaded", ""]
        lines += ["| Title | Author | Pages | File |", "|-------|--------|-------|------|"]
        for r in downloaded:
            lines.append(f"| {r['title']} | {r['author']} | {cell(r['pages'])} | `{Path(r['file']).name}` |")
        lines.append("")
    if dry_run_found:
        lines += ["## Dry Run — Found (not downloaded)", ""]
        lines += ["| Title | Author | Pages | MD5 |", "|-------|--------|-------|-----|"]
        for r in dry_run_found:
            lines.append(f"| {r['title']} | {r['author']} | {cell(r['pages'])} | `{r['md5']}` |")
        lines.append("")
    if md5_only:
        lines += ["## Found on AA — Download Failed (use MD5 for manual retrieval)", ""]
        lines += ["| Title | Author | MD5 | Notes |", "|-------|--------|-----|-------|"]
        for r in md5_only:
            lines.append(f"| {r['title']} | {r['author']} | `{r['md5']}` | {r['notes']} |")
        lines.append("")
    if not_found:
        lines += ["## Not Found on Anna's Archive", ""]
        lines += ["| Title | Author | Notes |", "|-------|--------|-------|"]
        for r in not_found:
            lines.append(f"| {r['title']} | {r['author']} | {r['notes']} |")
        lines.append("")
    if already_have:
        lines += ["## Already in Library", ""]
        lines += ["| Title | Author |", "|-------|--------|"]
        for r in already_have:
            lines.append(f"| {r['title']} | {r['author']} |")
        lines.append("")

    REPORT_PATH.write_text("\n".join(lines), encoding="utf-8")
    log.info(f"Report written to {REPORT_PATH}")
def main():
    """Command-line entry point: process TARGETS and write the report.

    Options:
        --dry-run  search only, download nothing
        --limit N  process only the first N targets (0 or omitted: all)

    An interrupted run (Ctrl-C) still writes a report for the books handled
    so far before the interrupt propagates.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()

    targets = TARGETS[:args.limit] if args.limit else TARGETS
    total = len(targets)
    log.info(f"Starting AA acquisition: {total} books | dry_run={args.dry_run}")

    results = []
    try:
        for i, (title, author, subdir) in enumerate(targets, 1):
            log.info(f"[{i}/{total}]")
            result = process_book(title, author, subdir, args.dry_run)
            results.append(result)
            log.info(f" -> {result['status']}")
            # Polite delay between requests; pointless after the final book.
            if i < total:
                time.sleep(random.uniform(8, 15))
    except KeyboardInterrupt:
        log.warning("Interrupted — writing report for books processed so far")
        if results:
            write_report(results)
        raise
    write_report(results)

    print(f"\n-- Summary -----------------------------------------------")
    for status in ["DOWNLOADED", "MD5 ONLY", "NOT FOUND", "ALREADY EXISTS", "DRY RUN — found"]:
        count = sum(1 for r in results if r["status"] == status)
        if count:
            print(f" {status:<35} {count:>3}")
    print(f" Report: {REPORT_PATH}")


if __name__ == "__main__":
    main()

478
scripts/aa_download_pass2.py Executable file
View file

@ -0,0 +1,478 @@
#!/usr/bin/env python3
"""
aa_download_pass2.py - Second-pass downloader for books that failed in pass 1.
Uses the MD5s listed in PASS1_FAILURES (falling back to the pass 1 report) and tries, in order:
1. Z-Library search by title/author (separate catalog from Libgen)
2. A fresh Anna's Archive search, only when no MD5 is known for the book
3. IPFS gateways using the IPFS CIDs scraped from AA's book page (a CID is different from the MD5)
4. The mirrors listed in MIRRORS, addressed by MD5
Note: a direct AA slow download with longer timeout + retry is not implemented in this script.
Checkpoint: saves progress to /opt/recon/data/aa_pass2_checkpoint.json
so interrupted runs resume where they left off.
Usage:
python3 /opt/recon/scripts/aa_download_pass2.py [--dry-run]
"""
import json
import time
import random
import logging
import hashlib
import argparse
from pathlib import Path
from datetime import datetime
import requests
from bs4 import BeautifulSoup
LOG_FILE = Path("/opt/recon/logs/aa_download_pass2.log")
# Pass 1 report (read for MD5s) and the report this script writes.
REPORT_IN = Path.home() / "projects/recon/aa_acquisition_report.md"
REPORT_OUT = Path.home() / "projects/recon/aa_acquisition_report_pass2.md"
# JSON file mapping title -> result dict for books already processed.
CHECKPOINT = Path("/opt/recon/data/aa_pass2_checkpoint.json")
BASE_LIB = Path("/mnt/library/Acquired")

# logging.FileHandler opens its file immediately and raises FileNotFoundError
# when the directory is missing, so make sure it exists before configuring.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
)
log = logging.getLogger("aa_pass2")

SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Accept-Language": "en-US,en;q=0.9",
})

# ── Mirrors to try in order ───────────────────────────────────────────────────
# Each template is filled with the book's MD5 via str.format(md5=...).
# IPFS gateways are deliberately absent: an /ipfs/<id> path needs a CID, not
# an MD5, so such entries could never succeed. IPFS retrieval is handled by
# get_ipfs_cids() + try_ipfs_download() with real CIDs.
MIRRORS = [
    # Libgen alternatives
    "https://libgen.li/ads.php?md5={md5}",
    "https://library.lol/main/{md5}",
    "https://libgen.rocks/get.php?md5={md5}",
    # Z-Library direct MD5 endpoint (sometimes works)
    "https://z-library.se/md5/{md5}",
]
# ── Books that failed in pass 1 — title, author, md5, subdir ─────────────────
# Each entry is a (title, author, md5, subdir) tuple. main() uses the md5 as
# given and only falls back to the pass 1 report when it is empty; subdir is
# resolved against BASE_LIB.
PASS1_FAILURES = [
    # Medical/Herbalism
    # NOTE(review): Volume 1 and Volume 2 below carry the same MD5, so both
    # entries would fetch the same file under two names — confirm which volume
    # this hash belongs to and correct (or blank) the other.
    ("The Earthwise Herbal Volume 1", "Matthew Wood", "fc8dc19f5a17f38849a3979830dc95c1", "Medical/Herbalism"),
    ("The Earthwise Herbal Volume 2", "Matthew Wood", "fc8dc19f5a17f38849a3979830dc95c1", "Medical/Herbalism"),
    ("Herbal Antibiotics", "Stephen Buhner", "5839dab78edfdff0d7986fac62b814da", "Medical/Herbalism"),
    ("The Herbal Medicine-Maker's Handbook", "James Green", "27e8e8a3585705ed194029b69c7d61b1", "Medical/Herbalism"),
    ("Rosemary Gladstar's Medicinal Herbs", "Rosemary Gladstar", "9b1966f20a32ab4331bfece167be1dd0", "Medical/Herbalism"),
    # Medical/Austere
    ("Wilderness Medicine", "Paul Auerbach", "957818eaa4ec40527bb05902f9ef7c51", "Medical/Austere"),
    ("Medicine for Mountaineering", "James Wilkerson", "39cb07998f2034206f0c9472e44cb0b4", "Medical/Austere"),
    # Medical/Veterinary
    ("The Chicken Health Handbook", "Gail Damerow", "0ba42fbea034b9a08ec8e2f8d7606efe", "Medical/Veterinary"),
    # Power
    ("The Renewable Energy Handbook", "William Kemp", "475d89fa80aea6c45aa4b1b4b9c5e274", "Power"),
    ("Homebrew Wind Power", "Dan Bartmann", "0578696d5b1b6bceb3e5e3302c1a31aa", "Power"),
    ("Wind Energy Basics", "Paul Gipe", "ccbe9d22e0a5e32d61921d20d66a8e05", "Power"),
    ("12-Volt Bible", "Brotherton", "3f964fa6d730fdf2c3d3e231e87cf692", "Power"),
    ("Wiring a House", "Rex Cauldwell", "5efcb53450e9eb560210eee40678adcf", "Power"),
    # Navigation
    ("Emergency Navigation", "David Burch", "25e4def9e777b3fa9ca935134732ff9d", "Navigation"),
    # Water
    ("Water Storage", "Art Ludwig", "17c965ec15c6cf4f09b5377b599a5266", "Water"),
    ("The Home Water Supply", "Stu Campbell", "9b22677d2f8e8b39f7a6bf032187295b", "Water"),
    # Food
    ("Fermented Vegetables", "Kirsten Shockey", "74d3bde876b4c17be66c21fdfa85213e", "Food"),
    ("The Art of Natural Cheesemaking", "David Asher", "bc0e0829d701fea9beca912d39f8cc74", "Food"),
    # Permaculture
    ("Edible Forest Gardens Volume 1", "Dave Jacke", "6b069c3bb077fdd89d487a363c070fbb", "Permaculture"),
    ("Edible Forest Gardens Volume 2", "Dave Jacke", "699255bfde7f69285c132a94ec291bf4", "Permaculture"),
    ("Creating a Forest Garden", "Martin Crawford", "96d71d70dba31ae86e14845f913e557e", "Permaculture"),
    ("Sepp Holzer's Permaculture", "Sepp Holzer", "32be55a9fce3e31cacd6912069abb410", "Permaculture"),
    ("The Permaculture Handbook", "Peter Bane", "08cb4492739fda4d01b5a868a408e4a0", "Permaculture"),
    ("The Market Gardener", "Jean-Martin Fortier", "ac69f6c8c22305b42b539482dc761c19", "Permaculture"),
    # Scenario
    ("SAS Survival Handbook", "John Wiseman", "fa967fd5fcbeb3c9887e22f73e590c64", "Scenario"),
    ("Pocket Ref", "Thomas Glover", "8e4988ce513a4aa75e7e6c00ee36692b", "Scenario"),
    ("Deep Survival", "Laurence Gonzales", "9a907ab13b81ea597407fffdb8ea1b04", "Scenario"),
    # Skills
    ("A Pattern Language", "Christopher Alexander","7f5cc06b5399b65a278c4005ccd8d476", "Skills"),
]
def load_checkpoint():
    """Load the checkpoint: a dict of {title: result_dict} for completed books.

    Returns an empty dict when the checkpoint file is missing, unreadable,
    not valid JSON, or does not contain a JSON object. Unusable files are
    reported in the log so a silent full restart cannot go unnoticed.
    """
    if not CHECKPOINT.exists():
        return {}
    try:
        data = json.loads(CHECKPOINT.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both JSON decoding and text decoding errors.
        log.warning(f"Ignoring unreadable checkpoint {CHECKPOINT}: {e}")
        return {}
    if not isinstance(data, dict):
        log.warning(f"Ignoring checkpoint {CHECKPOINT}: expected a JSON object")
        return {}
    return data
def save_checkpoint(completed):
    """Persist *completed* ({title: result_dict}) to the checkpoint file.

    The JSON is written to a sibling ``.tmp`` file first and then renamed
    over CHECKPOINT, so the checkpoint is replaced in a single step.
    """
    staging = Path(str(CHECKPOINT) + ".tmp")
    CHECKPOINT.parent.mkdir(parents=True, exist_ok=True)
    with staging.open("w") as handle:
        json.dump(completed, handle, indent=2)
    staging.replace(CHECKPOINT)
def load_md5s_from_report():
    """Parse MD5 hashes from the pass 1 report.

    Scans REPORT_IN for Markdown table rows whose third column is a
    back-quoted 32-character hexadecimal string (the layout of the pass 1
    "download failed" table) and returns ``{title.lower(): md5}``. Returns
    an empty dict when the report does not exist.
    """
    if not REPORT_IN.exists():
        return {}
    hex_digits = set("0123456789abcdefABCDEF")
    md5_map = {}
    # errors="replace": the MD5 rows are plain ASCII, so an oddly encoded
    # character elsewhere in the report must not abort parsing.
    report = REPORT_IN.read_text(encoding="utf-8", errors="replace")
    for line in report.splitlines():
        if "`" not in line:
            continue
        parts = line.split("|")
        if len(parts) < 4:
            continue
        title = parts[1].strip()
        md5_cell = parts[3].strip().strip("`")
        # Require real hex digits; isalnum() would accept any 32 letters.
        if len(md5_cell) == 32 and set(md5_cell) <= hex_digits:
            md5_map[title.lower()] = md5_cell
    return md5_map
def search_zlib(title, author):
    """Search Z-Library for a PDF and return the first book-page URL, or None.

    Only the first three ``/book/`` links on the result page are considered,
    and the first one with a non-empty href wins. The link is not checked
    against the title or author. Returns None on a non-200 response, when
    no link is found, or when the request raises.
    """
    try:
        response = SESSION.get(
            "https://z-library.se/s/",
            params={"q": f"{title} {author}", "extension[]": "pdf"},
            timeout=15,
        )
        if response.status_code == 200:
            # Z-lib book links contain /book/
            anchors = BeautifulSoup(response.text, "html.parser").select("a[href*='/book/']")
            hrefs = (anchor.get("href", "") for anchor in anchors[:3])
            first = next((h for h in hrefs if h), None)
            if first is not None:
                if first.startswith("/"):
                    return f"https://z-library.se{first}"
                return first
    except Exception as e:
        log.debug(f"Zlib search failed: {e}")
    return None
def try_zlib_download(book_url, dest_path):
    """Download a PDF via the download link on a Z-Library book page.

    Args:
        book_url: URL of the book page (as returned by search_zlib()).
        dest_path: ``pathlib.Path`` the verified PDF is saved to.

    Returns:
        True when a file starting with ``%PDF`` was stored at *dest_path*;
        False when no download link was found, the download failed, or the
        payload was not a PDF.

    Data is streamed to a sibling ``<name>.part`` file and renamed onto
    *dest_path* only after the signature check, so a failed transfer never
    leaves a partial file behind.
    """
    from urllib.parse import urljoin  # local import: only needed here

    part_path = dest_path.with_name(dest_path.name + ".part")
    r2 = None
    try:
        r = SESSION.get(book_url, timeout=15)
        soup = BeautifulSoup(r.text, "html.parser")
        dl = soup.select_one("a.addDownloadedBook, a[href*='/dl/'], a.btn-primary[href*='download']")
        if not dl or not dl.get("href"):
            return False
        # Resolve relative links against the page that served them.
        dl_url = urljoin(r.url, dl["href"])
        r2 = SESSION.get(dl_url, timeout=120, stream=True)
        if r2.status_code != 200:
            return False
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        with open(part_path, "wb") as f:
            for chunk in r2.iter_content(8192):
                f.write(chunk)
        with open(part_path, "rb") as f:
            is_pdf = f.read(4) == b"%PDF"
        if is_pdf:
            part_path.replace(dest_path)
            return True
    except Exception as e:
        log.debug(f"Zlib download failed: {e}")
    finally:
        # Release the streamed connection and drop leftover partial data
        # (a no-op after a successful rename).
        if r2 is not None:
            r2.close()
        part_path.unlink(missing_ok=True)
    return False
def try_mirrors(md5, dest_path):
    """Try every template in MIRRORS with *md5* until one yields a PDF.

    Args:
        md5: MD5 hash substituted into each mirror URL template.
        dest_path: ``pathlib.Path`` the verified PDF is saved to.

    Returns:
        True once a file starting with ``%PDF`` is stored at *dest_path*,
        False when all mirrors failed.

    When a mirror answers with HTML, the page is searched for a download
    link, which is resolved against the URL of that page. Data is streamed
    to a sibling ``<name>.part`` file and renamed onto *dest_path* only
    after the signature check, so a failed transfer never leaves a partial
    file behind.
    """
    import html
    import re as _re
    from urllib.parse import urljoin

    part_path = dest_path.with_name(dest_path.name + ".part")
    for tpl in MIRRORS:
        url = tpl.format(md5=md5)
        r = None
        try:
            r = SESSION.get(url, timeout=20, stream=True, allow_redirects=True)
            if r.status_code != 200:
                continue
            ctype = r.headers.get("content-type", "")
            if "html" in ctype:
                page = r.text
                # For libgen.li ads page, look for get.php with key
                match = _re.search(r'href="(get\.php\?md5=[^"]+)"', page)
                if match:
                    # The href comes from raw HTML, so "&amp;" must be
                    # decoded before it is used as a URL.
                    href = html.unescape(match.group(1))
                else:
                    soup = BeautifulSoup(page, "html.parser")
                    dl = (soup.select_one("a[href*='.pdf']") or
                          soup.select_one("a[href*='get.php']") or
                          soup.select_one("a[href*='/get/']"))
                    if not dl:
                        continue
                    href = dl["href"]
                # Relative links belong to the mirror that served the page.
                actual = urljoin(r.url, href)
                r.close()
                r = SESSION.get(actual, timeout=60, stream=True)
                if r.status_code != 200:
                    continue
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            with open(part_path, "wb") as f:
                for chunk in r.iter_content(8192):
                    f.write(chunk)
            with open(part_path, "rb") as f:
                is_pdf = f.read(4) == b"%PDF"
            if is_pdf:
                part_path.replace(dest_path)
                size_mb = dest_path.stat().st_size / 1024 / 1024
                log.info(f" [OK] {size_mb:.1f}MB via {url}")
                return True
        except Exception as e:
            log.debug(f"Mirror {url} failed: {e}")
        finally:
            # Release the streamed connection and drop leftover partial data
            # (a no-op after a successful rename).
            if r is not None:
                r.close()
            part_path.unlink(missing_ok=True)
        # Pause only after an actual download attempt or error; the
        # `continue` paths above skip straight to the next mirror.
        time.sleep(2)
    return False
def get_ipfs_cids(md5):
    """Collect IPFS CIDs mentioned on the AA book detail page for *md5*.

    Returns a list of distinct CID strings in order of first appearance
    (empty when the page cannot be fetched or mentions none).

    NOTE(review): the patterns are guesses at how the page embeds CIDs
    ("ipfs_cid" followed by separators, or an ipfs:// link) — confirm
    against a live page.
    """
    import html
    import re as _re

    cids = []
    try:
        r = SESSION.get(f"https://annas-archive.gl/md5/{md5}", timeout=20)
        if r.status_code == 200:
            # Decode entities first so JSON embedded in the HTML
            # (&quot;ipfs_cid&quot;: &quot;...&quot;) can be matched as well.
            page = html.unescape(r.text)
            patterns = (
                # Accepts the original `ipfs_cid: <cid>` form and quoted
                # key/value forms such as `"ipfs_cid": "<cid>"`.
                r'ipfs_cid["\'\s:=]+([A-Za-z0-9]{46,})',
                # Also check for CIDs in href attributes
                r'ipfs://([A-Za-z0-9]{46,})',
            )
            for pattern in patterns:
                for m in _re.finditer(pattern, page):
                    # De-duplicate: try_ipfs_download() only uses the first
                    # three entries, so repeats would waste that budget.
                    if m.group(1) not in cids:
                        cids.append(m.group(1))
    except Exception as e:
        log.debug(f"IPFS CID fetch failed: {e}")
    return cids
def try_ipfs_download(cids, dest_path):
    """Try downloading via IPFS public gateways.

    Args:
        cids: List of IPFS CIDs; only the first three are tried.
        dest_path: ``pathlib.Path`` the verified PDF is saved to.

    Returns:
        True once a file starting with ``%PDF`` is stored at *dest_path*,
        False when every CID/gateway combination failed.

    Data is streamed to a sibling ``<name>.part`` file and renamed onto
    *dest_path* only after the signature check, so a failed transfer never
    leaves a partial file behind.
    """
    gateways = [
        "https://cloudflare-ipfs.com/ipfs/{}",
        "https://dweb.link/ipfs/{}",
    ]
    part_path = dest_path.with_name(dest_path.name + ".part")
    for cid in cids[:3]:  # limit to first 3 CIDs
        for gw_tpl in gateways:
            url = gw_tpl.format(cid)
            r = None
            try:
                r = SESSION.get(url, timeout=15, stream=True)
                if r.status_code != 200:
                    continue
                dest_path.parent.mkdir(parents=True, exist_ok=True)
                with open(part_path, "wb") as f:
                    for chunk in r.iter_content(8192):
                        f.write(chunk)
                with open(part_path, "rb") as f:
                    is_pdf = f.read(4) == b"%PDF"
                if is_pdf:
                    part_path.replace(dest_path)
                    size_mb = dest_path.stat().st_size / 1024 / 1024
                    log.info(f" [OK] {size_mb:.1f}MB via IPFS {url[:60]}...")
                    return True
            except Exception as e:
                log.debug(f"IPFS {url} failed: {e}")
            finally:
                # Release the streamed connection and drop leftover partial
                # data (a no-op after a successful rename).
                if r is not None:
                    r.close()
                part_path.unlink(missing_ok=True)
            # Pause only after an actual attempt; non-200 answers skip it.
            time.sleep(1)
    return False
def search_aa_fresh(title, author):
    """Run a fresh Anna's Archive search and return a matching MD5, or None.

    The .gl, .se and .org domains are tried in that order. The first result
    link whose text contains the author's last name or the first
    non-article word of the title is accepted. Leading words such as "The"
    or "A" are skipped because they occur in almost any result text and
    would make the check meaningless.
    """
    articles = {"a", "an", "the"}
    title_words = [w.lower() for w in title.split()]
    key_word = next((w for w in title_words if w not in articles),
                    title_words[0] if title_words else "")
    author_words = author.split()
    surname = author_words[-1].lower() if author_words else ""
    # Empty needles are dropped: "" is a substring of every text.
    needles = [n for n in (surname, key_word) if n]

    for domain in ["annas-archive.gl", "annas-archive.se", "annas-archive.org"]:
        try:
            url = f"https://{domain}/search"
            params = {"q": f"{title} {author}", "ext": "pdf", "lang": "en"}
            r = SESSION.get(url, params=params, timeout=15)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            for a in soup.select("a[href^='/md5/']"):
                text = a.get_text(" ", strip=True)
                if not text:
                    continue
                md5 = a["href"].split("/md5/")[-1].split("/")[0].split("?")[0].strip()
                if len(md5) != 32:
                    continue
                text_lc = text.lower()
                if any(n in text_lc for n in needles):
                    return md5
        except Exception as e:
            log.debug(f"AA search on {domain} failed: {e}")
            continue
    return None
def process_book(title, author, md5_hint, subdir, dry_run):
    """Run the pass 2 acquisition chain for a single book.

    Order of attempts: existing file check, dry-run short-circuit,
    Z-Library, fresh AA search (only when no MD5 is known), IPFS gateways,
    then the MD5 mirrors. Returns a result dict with the keys ``title``,
    ``author``, ``status``, ``md5``, ``file`` and ``notes``.
    """
    result = dict(
        title=title, author=author,
        status="NOT FOUND", md5=md5_hint,
        file="", notes="",
    )
    stem = "".join(
        ch if (ch.isalnum() or ch in " ._-") else "_" for ch in title
    )[:60]
    surname = author.split()[-1]
    dest = BASE_LIB / subdir / f"{stem}_{surname}.pdf"

    def stored(status):
        # Record a result that points at the file on disk.
        result["status"] = status
        result["file"] = str(dest)
        return result

    if dest.exists():
        return stored("ALREADY EXISTS")
    if dry_run:
        result["status"] = "DRY RUN"
        return result

    # 1. Try Z-Library first (different catalog)
    log.info(f" Trying Z-Library...")
    zlib_url = search_zlib(title, author)
    if zlib_url and try_zlib_download(zlib_url, dest):
        return stored("DOWNLOADED (Z-Library)")

    # 2. If no MD5 from pass 1, do a fresh AA search
    md5 = md5_hint
    if not md5:
        log.info(f" Searching AA for fresh MD5...")
        md5 = search_aa_fresh(title, author)
        if md5:
            result["md5"] = md5
            log.info(f" Found MD5: {md5}")

    # Without an MD5 neither IPFS nor the mirrors can be tried.
    if not md5:
        result["notes"] = "Not found on AA or Z-Library"
        return result

    # 3. Try IPFS with real CIDs from AA detail page
    log.info(f" Fetching IPFS CIDs from AA...")
    cids = get_ipfs_cids(md5)
    if cids:
        log.info(f" Found {len(cids)} IPFS CID(s), trying gateways...")
        if try_ipfs_download(cids, dest):
            return stored("DOWNLOADED (IPFS)")

    # 4. Try all mirrors with MD5
    log.info(f" Trying mirrors with MD5 {md5}...")
    if try_mirrors(md5, dest):
        return stored("DOWNLOADED (mirror)")

    result["status"] = "MD5 ONLY"
    result["notes"] = f"MD5 confirmed, all mirrors failed: {md5}"
    return result
def write_report(results):
    """Write the pass 2 Markdown report to REPORT_OUT.

    *results* is a list of the dicts produced by process_book(). The report
    has a summary header and one table per non-empty group, in the order:
    downloaded, already in library, MD5 only, not found.
    """
    groups = {"downloaded": [], "MD5 ONLY": [], "NOT FOUND": [], "ALREADY EXISTS": []}
    for entry in results:
        status = entry["status"]
        # Any "DOWNLOADED (...)" variant counts as downloaded.
        if "DOWNLOADED" in status:
            groups["downloaded"].append(entry)
        if status in groups:
            groups[status].append(entry)
    downloaded = groups["downloaded"]
    md5_only = groups["MD5 ONLY"]
    not_found = groups["NOT FOUND"]
    existing = groups["ALREADY EXISTS"]

    out = []
    out.append("# AA Acquisition Report -- Pass 2")
    out.append(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    out.append(
        f"**Searched:** {len(results)} | **Downloaded:** {len(downloaded)} | "
        f"**MD5 only:** {len(md5_only)} | **Not found:** {len(not_found)}"
    )
    out.append("")

    def table(heading, header, divider, rows):
        # One section: heading, blank line, header + divider, rows, blank line.
        out.extend([heading, "", header, divider])
        out.extend(rows)
        out.append("")

    if downloaded:
        table(
            "## Downloaded",
            "| Title | Author | Via | File |",
            "|-------|--------|-----|------|",
            [f"| {r['title']} | {r['author']} | {r['status']} | `{Path(r['file']).name}` |"
             for r in downloaded],
        )
    if existing:
        table(
            "## Already in Library",
            "| Title | Author |",
            "|-------|--------|",
            [f"| {r['title']} | {r['author']} |" for r in existing],
        )
    if md5_only:
        table(
            "## MD5 Known -- All Mirrors Failed",
            "| Title | Author | MD5 |",
            "|-------|--------|-----|",
            [f"| {r['title']} | {r['author']} | `{r['md5']}` |" for r in md5_only],
        )
    if not_found:
        table(
            "## Not Found Anywhere",
            "| Title | Author | Notes |",
            "|-------|--------|-------|",
            [f"| {r['title']} | {r['author']} | {r['notes']} |" for r in not_found],
        )

    REPORT_OUT.parent.mkdir(parents=True, exist_ok=True)
    REPORT_OUT.write_text("\n".join(out))
    log.info(f"Report written to {REPORT_OUT}")
def main():
    """Command-line entry point for the second acquisition pass.

    Builds the target list from PASS1_FAILURES (filling missing MD5s from
    the pass 1 report), skips books already recorded in the checkpoint,
    processes the rest, saves the checkpoint after each book and finally
    writes the pass 2 report. ``--dry-run`` performs no downloads and does
    not read or write checkpoint entries.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # Load any MD5s captured in pass 1
    md5_map = load_md5s_from_report()
    targets = [
        (title, author, md5_hint or md5_map.get(title.lower(), ""), subdir)
        for title, author, md5_hint, subdir in PASS1_FAILURES
    ]

    # Load checkpoint
    completed = load_checkpoint()
    if completed:
        log.info(f"Resuming: {len(completed)} books already processed in previous run")
    log.info(f"Pass 2: {len(targets)} books | dry_run={args.dry_run}")

    # Statuses produced without any network traffic; no politeness delay
    # is needed after them.
    offline_statuses = {"ALREADY EXISTS", "DRY RUN"}
    total = len(targets)
    results = []
    for i, (title, author, md5, subdir) in enumerate(targets, 1):
        # Check checkpoint — skip already-processed books. Entries that are
        # not usable result dicts are ignored and the book is reprocessed.
        cached = completed.get(title)
        if isinstance(cached, dict) and "status" in cached and not args.dry_run:
            results.append(cached)
            log.info(f"[{i}/{total}] {title} — SKIPPED (checkpoint: {cached['status']})")
            continue

        log.info(f"[{i}/{total}] {title} -- {author}")
        result = process_book(title, author, md5, subdir, args.dry_run)
        results.append(result)
        log.info(f" -> {result['status']}")

        # Save checkpoint after each book (not in dry-run)
        if not args.dry_run:
            completed[title] = result
            save_checkpoint(completed)

        if result["status"] not in offline_statuses and i < total:
            time.sleep(random.uniform(6, 12))

    write_report(results)

    print(f"\n-- Pass 2 Summary ----------------------------------------")
    for status in ["DOWNLOADED (Z-Library)", "DOWNLOADED (IPFS)", "DOWNLOADED (mirror)", "MD5 ONLY", "NOT FOUND", "ALREADY EXISTS", "DRY RUN"]:
        count = sum(1 for r in results if r["status"] == status)
        if count:
            print(f" {status:<35} {count:>3}")
    print(f" Report: {REPORT_OUT}")


if __name__ == "__main__":
    main()

64
scripts/backup.sh Executable file
View file

@ -0,0 +1,64 @@
#!/bin/bash
# RECON Backup Script
# Backs up the precious data: concept JSONs, text extracts, SQLite DB
# Qdrant is NOT backed up — rebuilt from JSONs via `recon rebuild`
# Destination: Contabo VPS (100.64.0.1) via rsync+SSH
set -euo pipefail

RECON_DIR="/opt/recon"
DATA_DIR="$RECON_DIR/data"
LOG_FILE="$RECON_DIR/logs/backup.log"
DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_HOST="root@100.64.0.1"
BACKUP_BASE="/opt/backups/recon"

# Print a timestamped message to stdout and append it to the log file.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

mkdir -p "$RECON_DIR/logs"

# Private scratch directory for the DB snapshot. The EXIT trap removes it even
# when `set -e` aborts the script (e.g. rsync fails), so no database copy is
# left behind under a predictable /tmp name.
TMP_DIR=$(mktemp -d)
trap 'rm -rf "$TMP_DIR"' EXIT

log "=== RECON Backup Starting ==="

# ── 1. SQLite DB (small, fast, critical) ──
log "Backing up recon.db..."
LOCAL_DB_BACKUP="$TMP_DIR/recon_${DATE}.db"
# .backup takes a consistent snapshot through sqlite3 rather than copying the
# live database file.
sqlite3 "$DATA_DIR/recon.db" ".backup '$LOCAL_DB_BACKUP'"
rsync -az "$LOCAL_DB_BACKUP" "$BACKUP_HOST:$BACKUP_BASE/recon_${DATE}.db"
rm -f "$LOCAL_DB_BACKUP"
# Keep last 7 daily DB backups on remote
ssh "$BACKUP_HOST" "ls -t $BACKUP_BASE/recon_*.db 2>/dev/null | tail -n +8 | xargs rm -f 2>/dev/null || true"
log " recon.db backed up"

# ── 2. Concept JSONs (THE PRECIOUS DATA — $130+ of Gemini work) ──
log "Syncing concept JSONs..."
CONCEPT_COUNT=$(find "$DATA_DIR/concepts/" -name "*.json" 2>/dev/null | wc -l || true)
if [ "$CONCEPT_COUNT" -gt 0 ]; then
    rsync -az --delete "$DATA_DIR/concepts/" "$BACKUP_HOST:$BACKUP_BASE/concepts/"
    log " concepts synced ($CONCEPT_COUNT JSON files)"
else
    # An empty source combined with --delete would erase the remote copy of
    # the one dataset that cannot be regenerated cheaply.
    log " WARNING: no local concept JSONs found; skipping sync so --delete cannot wipe the remote copy"
fi

# ── 3. Text extracts (regenerable but expensive in time) ──
log "Syncing text extracts..."
rsync -az --delete "$DATA_DIR/text/" "$BACKUP_HOST:$BACKUP_BASE/text/"
# find lists the text/ directory itself as well, hence the "- 1" below.
TEXT_COUNT=$(find "$DATA_DIR/text/" -maxdepth 1 -type d 2>/dev/null | wc -l)
log " text synced ($((TEXT_COUNT - 1)) document dirs)"

# ── 4. Intel feeds ──
if [ -d "$DATA_DIR/intel" ]; then
    log "Syncing intel feeds..."
    rsync -az --delete "$DATA_DIR/intel/" "$BACKUP_HOST:$BACKUP_BASE/intel/"
    log " intel synced"
fi

# ── 5. Config files ──
log "Backing up config..."
rsync -az "$RECON_DIR/config.yaml" "$BACKUP_HOST:$BACKUP_BASE/config_${DATE}.yaml"
# .env is optional: a missing file must not abort the backup.
rsync -az "$RECON_DIR/.env" "$BACKUP_HOST:$BACKUP_BASE/env_${DATE}" 2>/dev/null || true
# Keep the last 3 config and env snapshots on the remote.
ssh "$BACKUP_HOST" "ls -t $BACKUP_BASE/config_*.yaml 2>/dev/null | tail -n +4 | xargs rm -f 2>/dev/null || true"
ssh "$BACKUP_HOST" "ls -t $BACKUP_BASE/env_* 2>/dev/null | tail -n +4 | xargs rm -f 2>/dev/null || true"
log " config backed up"

# ── Summary ──
BACKUP_SIZE=$(ssh "$BACKUP_HOST" "du -sh $BACKUP_BASE" | cut -f1)
log "=== Backup Complete: $BACKUP_SIZE on Contabo ==="

449
scripts/cleanup_outliers.py Executable file
View file

@ -0,0 +1,449 @@
#!/usr/bin/env python3
"""
cleanup_outliers.py Three-pass cleanup of RECON concept data.
Pass 1: Remap ~160 non-canonical domain strings in concept JSONs + Qdrant payloads
Pass 2: Re-enrich 434 concepts with empty domain arrays via Gemini
Pass 3: Purge junk/noise URLs from Qdrant + SQLite DB
Usage:
python3 /opt/recon/scripts/cleanup_outliers.py [--dry-run] [--skip-pass N]
"""
import json
import time
import random
import logging
import argparse
import threading
import sqlite3
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import FieldCondition, MatchAny, Filter
import sys, os
sys.path.insert(0, '/opt/recon')
from lib.utils import get_config, setup_logging
# Log file for this script.
# NOTE(review): FileHandler opens this path when the module is imported, so
# /opt/recon/logs presumably has to exist beforehand — confirm.
LOG_FILE = Path("/opt/recon/logs/cleanup_outliers.log")
# Every record goes to both the log file and a stream handler.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
)
log = logging.getLogger("cleanup_outliers")
# Root directory searched for window_*.json concept files.
CONCEPTS_DIR = Path("/opt/recon/data/concepts")
# SQLite database whose `documents` table is updated by pass 3.
DB_PATH = Path("/opt/recon/data/recon.db")
# The only domain names accepted as-is; everything else is remapped or dropped.
CANONICAL_DOMAINS = {
"Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
"Foundational Skills", "Communications", "Medical", "Food Systems",
"Navigation", "Logistics", "Power Systems", "Leadership",
"Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
}
# Non-canonical → canonical remap
OUTLIER_MAP = {
"Zoology": "Sustainment Systems",
"Botany": "Sustainment Systems",
"Nature Lore": "Sustainment Systems",
"Ecology": "Sustainment Systems",
"Navigational Astronomy": "Navigation",
"Troubleshooting": "Foundational Skills",
"Chemistry": "Foundational Skills",
"Metallurgy": "Foundational Skills",
"Weird Science": "Foundational Skills",
"Philosophy of physics": "Foundational Skills",
"Physics": "Foundational Skills",
"Cell biology": "Foundational Skills",
"Economics": "Leadership",
"Business": "Leadership",
"Safety": "Security",
"Law Enforcement": "Security",
"Security & Intelligence": "Security",
"Fire Weather": "Scenario Playbooks",
"Legal": "Leadership",
# Discard — replace with closest real domain
"Site News": "Foundational Skills",
"Paleogeography": "Foundational Skills",
"Chemical Manipulation": "Foundational Skills",
}
# Junk URL patterns — pages with no knowledge value
JUNK_URL_PATTERNS = [
# rocketstoves.com nav/template garbage
"rocketstoves.com/favicon",
"rocketstoves.com/cropped-favicon",
"rocketstoves.com/layouts/",
"rocketstoves.com/sample",
"rocketstoves.com/templates/",
"rocketstoves.com/hello-world",
"rocketstoves.com/blog-forthcoming",
"rocketstoves.com/contact",
"rocketstoves.com/acknowledgements",
"rocketstoves.com/ja3",
"rocketstoves.com/juxtapositions",
"rocketstoves.com/no-name-soi",
"rocketstoves.com/big4",
"rocketstoves.com/roof",
"rocketstoves.com/rmh_dloadcover",
"rocketstoves.com/pedcover",
"rocketstoves.com/laundry-to-landscape",
"rocketstoves.com/barreloven",
# NRCS calendar/event noise
"nrcs.usda.gov/events/",
"nrcs.usda.gov/state-offices/massachusetts",
"nrcs.usda.gov/state-offices/nebraska",
"nrcs.usda.gov/state-offices/oklahoma",
"nrcs.usda.gov/state-offices/utah",
"nrcs.usda.gov/conservation-basics/natural-resource-concerns/soil/western-call-for-abstracts",
# deeranddeerhunting trophy hunt videos (no knowledge value)
"deeranddeerhunting.com/trophy-whitetails-exclusive-videos/",
# eattheweeds non-content pages
"eattheweeds.com/media-interviews-with-green-deane",
"eattheweeds.com/motorcycles-and-mushrooms",
"eattheweeds.com/sunny-savage",
# foragersharvest nav pages
"foragersharvest.com/contact",
"foragersharvest.com/podcasts",
# motherearthnews classifieds/nav
"motherearthnews.com/classifieds/",
"motherearthnews.com/biographies/",
]
CLASSIFY_PROMPT = """\
Classify this knowledge concept into one or more domains.
VALID DOMAINS (use ONLY these exact strings):
Defense & Tactics, Sustainment Systems, Off-Grid Systems, Foundational Skills,
Communications, Medical, Food Systems, Navigation, Logistics, Power Systems,
Leadership, Scenario Playbooks, Water Systems, Security, Community Coordination
Concept title: {title}
Concept tags: {subdomain}
Concept preview: {content}
Return ONLY valid JSON, no markdown:
{{"domain": ["Domain Name"]}}
Rules:
- Never return empty domain list
- Medical content, herbs, first aid, veterinary Medical
- Food growing, foraging, hunting, livestock Sustainment Systems
- Food preservation, canning, storage Food Systems
- Solar, wind, batteries, generators Power Systems
- Water sourcing, filtration, sanitation Water Systems
"""
def load_gemini_keys(env_path=Path("/opt/recon/.env")):
    """Return the Gemini API keys listed in a dotenv-style file.

    Every line that starts with ``GEMINI_KEY_`` and has the form
    ``NAME=value`` contributes ``value`` (whitespace-stripped), in file order.
    Lines without ``=`` or with an empty value are ignored.

    Args:
        env_path: File to read. Defaults to ``/opt/recon/.env``.

    Returns:
        list[str]: The key values; empty when the file defines none.

    Raises:
        OSError: If the file cannot be read (e.g. FileNotFoundError).
    """
    keys = []
    for line in Path(env_path).read_text().splitlines():
        if not line.startswith("GEMINI_KEY_"):
            continue
        # partition() never raises, unlike split("=", 1)[1] on a line with no "=".
        _name, sep, value = line.partition("=")
        value = value.strip()
        if sep and value:
            keys.append(value)
    return keys
class KeyRotator:
    """Thread-safe round-robin dispenser of API keys."""

    def __init__(self, keys):
        """Store the keys to rotate through.

        Args:
            keys: Iterable of API key strings. It may be empty; the error is
                deferred to next() so that callers which never request a key
                are unaffected.
        """
        # Snapshot so later mutation of the caller's list cannot shift the rotation.
        self.keys = list(keys)
        self._i = 0
        self._lock = threading.Lock()

    def next(self):
        """Return the next key in rotation.

        Raises:
            RuntimeError: If the rotator was created without any keys.
        """
        with self._lock:
            if not self.keys:
                # Previously this surfaced as an opaque ZeroDivisionError.
                raise RuntimeError("KeyRotator has no API keys to hand out")
            key = self.keys[self._i % len(self.keys)]
            self._i += 1
            return key
def classify_concept(title, subdomains, content, key):
    """Ask Gemini to assign one or more canonical domains to a concept.

    Args:
        title: Concept title; "(untitled)" is sent when falsy.
        subdomains: Tag list (a bare string is treated as a single tag).
            Only the first ten tags are sent.
        content: Concept text; only the first 300 characters of
            ``str(content)`` are sent.
        key: Gemini API key used for this call.

    Returns:
        list[str]: Non-empty, de-duplicated list of CANONICAL_DOMAINS members
        in the order the model returned them. ``["Foundational Skills"]`` when
        no usable answer was obtained within four attempts.
    """
    if isinstance(subdomains, str):
        # A bare string would otherwise be sliced and joined character by character.
        subdomains = [subdomains]
    prompt = CLASSIFY_PROMPT.format(
        title=title or "(untitled)",
        subdomain=", ".join(str(s) for s in subdomains[:10]) if subdomains else "(none)",
        content=str(content)[:300] if content else "(none)",
    )
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"},
    )
    for attempt in range(4):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
            # Tolerate {"domain": [...]}, {"domain": "X"} and a bare list or string.
            raw = data.get("domain", []) if isinstance(data, dict) else data
            if isinstance(raw, str):
                raw = [raw]
            elif not isinstance(raw, list):
                raw = []
            domains = list(dict.fromkeys(
                d for d in raw if isinstance(d, str) and d in CANONICAL_DOMAINS
            ))
            if domains:
                return domains
            # No valid domain in the reply: fall through and ask again.
        except Exception as e:
            err = str(e).lower()
            # NOTE(review): the "rate" marker also matches unrelated words such
            # as "generate"; left unchanged to keep the retry policy as it was.
            if any(s in err for s in ["429", "quota", "rate", "503"]):
                # Exponential backoff with jitter, capped at 60 seconds.
                time.sleep(min(5 * (2 ** attempt) + random.uniform(0, 3), 60))
            else:
                break
    return ["Foundational Skills"]
# ── PASS 1: Remap outlier domains ────────────────────────────────────────────
def remap_concept_domains(domains):
    """Map a concept's domain list onto the canonical domain set.

    Names in CANONICAL_DOMAINS are kept, names in OUTLIER_MAP are replaced by
    their mapped target, and any other name is dropped.

    Args:
        domains: Iterable of domain strings; None is treated as empty.

    Returns:
        tuple[list[str], bool]: The de-duplicated canonical domains in
        first-seen order (so repeated runs produce identical output), and a
        flag that is True when at least one entry was replaced or dropped.
    """
    remapped = {}  # dict keys act as an insertion-ordered set
    changed = False
    for name in domains or ():
        if name in CANONICAL_DOMAINS:
            remapped[name] = None
        elif name in OUTLIER_MAP:
            remapped[OUTLIER_MAP[name]] = None
            changed = True
        else:
            changed = True  # unknown name: drop it
    return list(remapped), changed
def pass1_remap_outliers(qdrant, collection, dry_run):
    """Pass 1: replace non-canonical domain names in Qdrant and on disk.

    Qdrant points whose ``domain`` payload contains any OUTLIER_MAP key are
    rewritten through remap_concept_domains(); the same remap is then applied
    to every ``window_*.json`` file under CONCEPTS_DIR. A concept left with no
    domain after remapping falls back to ``["Foundational Skills"]``.

    Args:
        qdrant: Client object; ``scroll`` and ``set_payload`` are called on it.
        collection: Name of the Qdrant collection to scan.
        dry_run: When true, only count what would change; nothing is written.

    Returns:
        defaultdict: Counters ``qdrant_updated`` and ``json_updated``.
    """
    log.info("=== PASS 1: Remapping non-canonical outlier domains ===")
    stats = defaultdict(int)
    outlier_filter = Filter(
        must=[FieldCondition(key="domain", match=MatchAny(any=list(OUTLIER_MAP)))]
    )

    # Collect every affected point first; payloads are edited afterwards.
    offset = None
    affected_points = []
    while True:
        results, offset = qdrant.scroll(
            collection_name=collection,
            scroll_filter=outlier_filter,
            limit=500,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        affected_points.extend(results)
        if offset is None:
            break
    log.info(f"Found {len(affected_points)} Qdrant points with outlier domains")

    for point in affected_points:
        old_domains = (point.payload or {}).get("domain")
        if old_domains is None:
            old_domains = []  # explicit null would otherwise crash the remap
        elif isinstance(old_domains, str):
            old_domains = [old_domains]
        new_domains, changed = remap_concept_domains(old_domains)
        if not changed:
            continue
        stats["qdrant_updated"] += 1
        if not dry_run:
            qdrant.set_payload(
                collection_name=collection,
                payload={"domain": new_domains or ["Foundational Skills"]},
                points=[point.id],
            )

    # Apply the same remap to the concept JSON files on disk.
    for window_file in CONCEPTS_DIR.rglob("window_*.json"):
        try:
            with open(window_file, "r", encoding="utf-8") as f:
                concepts = json.load(f)
        except (OSError, ValueError) as exc:
            # Best effort: report the file instead of skipping it silently.
            log.warning(f"Skipping unreadable concept file {window_file}: {exc}")
            continue
        if not isinstance(concepts, list):
            continue
        file_changed = False
        for concept in concepts:
            if not isinstance(concept, dict):
                continue
            raw = concept.get("domain")
            if raw is None:
                raw = []
            elif isinstance(raw, str):
                raw = [raw]
            new, changed = remap_concept_domains(raw)
            if changed:
                concept["domain"] = new or ["Foundational Skills"]
                file_changed = True
        if not file_changed:
            continue
        stats["json_updated"] += 1
        if not dry_run:
            with open(window_file, "w", encoding="utf-8") as f:
                json.dump(concepts, f, indent=2, ensure_ascii=False)

    log.info(f"Pass 1 complete: {stats['qdrant_updated']} Qdrant points updated, {stats['json_updated']} JSON files updated")
    return stats
# ── PASS 2: Re-enrich empty domain concepts ──────────────────────────────────
def pass2_empty_domains(qdrant, collection, key_rotator, dry_run):
    """Pass 2: classify concepts that have no domain and store the result.

    Scrolls the whole collection, picks points whose ``domain`` payload is
    missing or empty, asks classify_concept() for a domain list, writes it to
    the Qdrant payload and to same-titled concepts in the document's
    ``window_*.json`` files under CONCEPTS_DIR / doc_hash.

    Args:
        qdrant: Client object; ``scroll`` and ``set_payload`` are called on it.
        collection: Name of the Qdrant collection to scan.
        key_rotator: Object whose ``next()`` returns the API key for a call.
        dry_run: When true, only count the affected points. No classification
            request is made, because its result would be discarded anyway.

    Returns:
        defaultdict: Counter ``classified`` (points selected for classification).
    """
    log.info("=== PASS 2: Re-enriching empty domain concepts ===")
    stats = defaultdict(int)

    def _is_empty(domain):
        # Missing, None, "", [] and [""] all mean "no domain assigned".
        return not domain or domain == [""]

    offset = None
    empty_points = []
    while True:
        results, offset = qdrant.scroll(
            collection_name=collection,
            limit=500,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        empty_points.extend(r for r in results if _is_empty(r.payload.get("domain", [])))
        if offset is None:
            break
    log.info(f"Found {len(empty_points)} points with empty domains")

    for point in empty_points:
        stats["classified"] += 1
        if dry_run:
            continue
        payload = point.payload
        title = payload.get("title", "")
        subdomains = payload.get("subdomain", [])
        content = payload.get("content", payload.get("summary", ""))
        new_domains = classify_concept(title, subdomains, content, key_rotator.next())
        qdrant.set_payload(
            collection_name=collection,
            payload={"domain": new_domains},
            points=[point.id],
        )

        # Mirror the new domain into the concept JSON files of the same document.
        doc_hash = payload.get("doc_hash", "")
        doc_concepts_dir = CONCEPTS_DIR / doc_hash if doc_hash else None
        if doc_concepts_dir is not None and doc_concepts_dir.exists():
            for wf in doc_concepts_dir.glob("window_*.json"):
                try:
                    with open(wf, "r", encoding="utf-8") as f:
                        concepts = json.load(f)
                    if not isinstance(concepts, list):
                        continue
                    changed = False
                    for c in concepts:
                        if (
                            isinstance(c, dict)
                            and c.get("title") == title
                            and _is_empty(c.get("domain", []))
                        ):
                            c["domain"] = new_domains
                            changed = True
                    if changed:
                        with open(wf, "w", encoding="utf-8") as f:
                            json.dump(concepts, f, indent=2, ensure_ascii=False)
                except (OSError, ValueError) as exc:
                    # Best effort per file, but leave a trace of what was skipped.
                    log.warning(f"Could not update {wf}: {exc}")
        time.sleep(0.05)  # small pause between classification requests

    log.info(f"Pass 2 complete: {stats['classified']} concepts re-classified")
    return stats
# ── PASS 3: Purge junk URLs ──────────────────────────────────────────────────
def is_junk_url(url, patterns=None):
    """Return True when *url* contains any junk pattern.

    Matching is a case-insensitive substring test.

    Args:
        url: URL (or filename payload) to test. None or empty never matches.
        patterns: Iterable of substrings to look for; defaults to
            JUNK_URL_PATTERNS.
    """
    if not url:
        return False
    if patterns is None:
        patterns = JUNK_URL_PATTERNS
    url_lower = str(url).lower()
    return any(pattern.lower() in url_lower for pattern in patterns)
def pass3_purge_junk(qdrant, collection, dry_run):
    """Pass 3: delete vectors that came from junk URLs and flag their documents.

    Scrolls all points with ``source_type == "web"``, tests each ``filename``
    payload with is_junk_url(), deletes the matching points from Qdrant and
    sets the matching documents to ``status = 'skipped'`` in the SQLite
    database at DB_PATH.

    Args:
        qdrant: Client object; ``scroll`` and ``delete`` are called on it.
        collection: Name of the Qdrant collection to scan.
        dry_run: When true, only count; nothing is deleted or updated.

    Returns:
        defaultdict: Counters ``junk_vectors`` and ``junk_docs``.
    """
    log.info("=== PASS 3: Purging junk URLs ===")
    stats = defaultdict(int)
    web_only = Filter(
        must=[FieldCondition(key="source_type", match=MatchAny(any=["web"]))]
    )

    offset = None
    junk_point_ids = []
    junk_doc_hashes = set()
    while True:
        results, offset = qdrant.scroll(
            collection_name=collection,
            scroll_filter=web_only,
            limit=500,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        for r in results:
            # "or" covers a payload that stores an explicit null filename.
            filename = r.payload.get("filename") or ""
            doc_hash = r.payload.get("doc_hash", "")
            if is_junk_url(filename):
                junk_point_ids.append(r.id)
                if doc_hash:
                    junk_doc_hashes.add(doc_hash)
        if offset is None:
            break
    log.info(f"Found {len(junk_point_ids)} junk vectors across {len(junk_doc_hashes)} documents")

    if not dry_run and junk_point_ids:
        # Delete in batches
        batch_size = 500
        for start in range(0, len(junk_point_ids), batch_size):
            qdrant.delete(
                collection_name=collection,
                points_selector=junk_point_ids[start:start + batch_size],
            )
        log.info(f"Deleted {len(junk_point_ids)} junk vectors from Qdrant")

        # Mark junk docs as skipped in SQLite
        conn = sqlite3.connect(str(DB_PATH))
        try:
            # "with conn" commits on success and rolls back if an update fails.
            with conn:
                conn.executemany(
                    "UPDATE documents SET status = 'skipped', error_message = 'junk content purged' WHERE hash = ?",
                    [(doc_hash,) for doc_hash in junk_doc_hashes],
                )
        finally:
            conn.close()  # released even when the update raises
        log.info(f"Marked {len(junk_doc_hashes)} documents as skipped in DB")

    stats["junk_vectors"] = len(junk_point_ids)
    stats["junk_docs"] = len(junk_doc_hashes)
    log.info(f"Pass 3 complete: {stats['junk_vectors']} vectors, {stats['junk_docs']} docs purged")
    return stats
def main():
    """CLI entry point: run every cleanup pass not excluded via --skip-pass.

    Options:
        --dry-run       Report what would change without writing.
        --skip-pass N   Skip pass N; may be given several times.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--skip-pass", type=int, action="append", default=[])
    args = parser.parse_args()
    config = get_config()

    # Only pass 2 talks to Gemini, so keys are loaded (and required) only when
    # it will run. Checking up front avoids failing after pass 1 has already
    # modified data.
    run_pass2 = 2 not in args.skip_pass
    rotator = None
    if run_pass2:
        keys = load_gemini_keys()
        if not keys:
            log.error("No Gemini keys found in .env")
            return
        rotator = KeyRotator(keys)

    qdrant = QdrantClient(
        host=config['vector_db']['host'],
        port=config['vector_db']['port'],
        timeout=60
    )
    collection = config['vector_db']['collection']
    log.info(f"Starting cleanup | dry_run={args.dry_run} | skipping passes: {args.skip_pass}")
    if 1 not in args.skip_pass:
        pass1_remap_outliers(qdrant, collection, args.dry_run)
    if run_pass2:
        pass2_empty_domains(qdrant, collection, rotator, args.dry_run)
    if 3 not in args.skip_pass:
        pass3_purge_junk(qdrant, collection, args.dry_run)
    log.info("All passes complete.")


if __name__ == "__main__":
    main()

215
scripts/domain_reenrich.py Executable file
View file

@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""
domain_reenrich.py — Re-enriches solo-Reference concepts that domain_remap.py
couldn't fix via subdomain lookup. Reads remap_unknowns.jsonl, calls Gemini
with a lightweight classification-only prompt, updates domain in-place.
Usage:
python3 /opt/recon/scripts/domain_reenrich.py [--workers 16] [--limit N]
Reads: /opt/recon/data/remap_unknowns.jsonl
Writes: domain field in-place in window JSON files
Log: /opt/recon/logs/domain_reenrich.log
"""
import json
import time
import random
import logging
import argparse
import threading
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import google.generativeai as genai
# JSONL input: one JSON object per line describing a concept to re-classify.
UNKNOWNS_FILE = Path("/opt/recon/data/remap_unknowns.jsonl")
# NOTE(review): FileHandler opens this path when the module is imported, so
# /opt/recon/logs presumably has to exist beforehand — confirm.
LOG_FILE = Path("/opt/recon/logs/domain_reenrich.log")
# Every record goes to both the log file and a stream handler.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
handlers=[
logging.FileHandler(LOG_FILE),
logging.StreamHandler(),
]
)
log = logging.getLogger("domain_reenrich")
# Domain names offered to the model, in the order they appear in the prompt.
CANONICAL_DOMAINS = [
"Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
"Foundational Skills", "Communications", "Medical", "Food Systems",
"Navigation", "Logistics", "Power Systems", "Leadership",
"Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
]
# Set form of the same names, used to validate the model's answer.
DOMAIN_SET = set(CANONICAL_DOMAINS)
CLASSIFY_PROMPT = """\
Classify this knowledge concept into one or more domains.
VALID DOMAINS (use ONLY these exact strings, no others):
{domains}
Concept title: {title}
Concept tags: {subdomain}
Concept preview: {content}
Return ONLY valid JSON, no markdown, no explanation:
{{"domain": ["Domain Name"]}}
Rules:
- Use only the domain strings listed above, spelled exactly
- If genuinely multi-domain assign all that apply
- Never return empty domain list pick the closest match
- Medical content, herbs, first aid, veterinary Medical
- Food growing, foraging, hunting, livestock Sustainment Systems
- Food preservation, canning, storage Food Systems
- Solar, wind, batteries, generators Power Systems
- Water sourcing, filtration, sanitation Water Systems
"""
def load_gemini_keys(env_path=Path("/opt/recon/.env")):
    """Return the Gemini API keys listed in a dotenv-style file.

    Every line that starts with ``GEMINI_KEY_`` and has the form
    ``NAME=value`` contributes ``value`` (whitespace-stripped), in file order.
    Lines without ``=`` or with an empty value are ignored.

    Args:
        env_path: File to read. Defaults to ``/opt/recon/.env``.

    Returns:
        list[str]: The key values; empty when the file defines none.

    Raises:
        OSError: If the file cannot be read (e.g. FileNotFoundError).
    """
    env = Path(env_path)
    keys = []
    for line in env.read_text().splitlines():
        if not line.startswith("GEMINI_KEY_"):
            continue
        # partition() never raises, unlike split("=", 1)[1] on a line with no "=".
        _name, sep, value = line.partition("=")
        value = value.strip()
        if sep and value:
            keys.append(value)
    return keys
class KeyRotator:
    """Hands out API keys in round-robin order; safe to share between threads."""

    def __init__(self, keys):
        """Remember the key sequence and start the cursor at its first entry."""
        self.keys = keys
        self._i = 0
        self._lock = threading.Lock()

    def next(self):
        """Return the key under the cursor and advance the cursor by one."""
        self._lock.acquire()
        try:
            position = self._i % len(self.keys)
            chosen = self.keys[position]
            self._i = self._i + 1
            return chosen
        finally:
            self._lock.release()
def classify_concept(title, subdomains, content, key):
    """Ask Gemini to assign one or more canonical domains to a concept.

    Args:
        title: Concept title; "(untitled)" is sent when falsy.
        subdomains: Tag list (a bare string is treated as a single tag).
            Only the first ten tags are sent.
        content: Concept preview; only the first 300 characters of
            ``str(content)`` are sent.
        key: Gemini API key used for this call.

    Returns:
        list[str]: Non-empty, de-duplicated list of DOMAIN_SET members in the
        order the model returned them. ``["Foundational Skills"]`` when no
        usable answer was obtained within four attempts.
    """
    if isinstance(subdomains, str):
        # A bare string would otherwise be sliced and joined character by character.
        subdomains = [subdomains]
    prompt = CLASSIFY_PROMPT.format(
        domains="\n".join(f" {d}" for d in CANONICAL_DOMAINS),
        title=title or "(untitled)",
        subdomain=", ".join(str(s) for s in subdomains[:10]) if subdomains else "(none)",
        # str() keeps non-string previews (numbers, lists) from breaking the slice.
        content=str(content)[:300] if content else "(none)",
    )
    # NOTE(review): genai.configure looks like process-wide configuration, so
    # worker threads using different keys may overwrite each other's key
    # between configure() and generate_content() — confirm against the SDK.
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"},
    )
    for attempt in range(4):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
            # Tolerate {"domain": [...]}, {"domain": "X"} and a bare list or string.
            raw = data.get("domain", []) if isinstance(data, dict) else data
            if isinstance(raw, str):
                raw = [raw]
            elif not isinstance(raw, list):
                raw = []
            domains = list(dict.fromkeys(
                d for d in raw if isinstance(d, str) and d in DOMAIN_SET
            ))
            if domains:
                return domains
            # No valid domain in the reply: fall through and ask again.
        except Exception as e:
            err = str(e).lower()
            if any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
                # Exponential backoff with jitter, capped at 60 seconds.
                delay = min(5 * (2 ** attempt) + random.uniform(0, 3), 60)
                time.sleep(delay)
            else:
                break
    return ["Foundational Skills"]  # last-resort fallback
# Several queued concepts can live in the same window file. One lock per path
# keeps the worker threads' read-modify-write cycles from overwriting each other.
_FILE_LOCKS = defaultdict(threading.Lock)
_FILE_LOCKS_GUARD = threading.Lock()


def _lock_for(filepath):
    """Return the lock dedicated to *filepath*, creating it on first use."""
    with _FILE_LOCKS_GUARD:
        return _FILE_LOCKS[str(filepath)]


def process_unknown(item, key_rotator):
    """Re-classify one queued concept and write the result back to its file.

    Args:
        item: Dict with ``filepath`` (required) and optional ``title``,
            ``subdomain`` and ``content_preview``.
        key_rotator: Object whose ``next()`` returns the API key to use.

    Returns:
        str: One of ``"ok"``, ``"file_missing"``, ``"read_error"``,
        ``"not_list"``, ``"already_fixed"`` (no same-titled concept still has
        a ``["Reference"]`` or empty domain) or ``"write_error"``.
    """
    filepath = Path(item["filepath"])
    title = item.get("title", "")
    subdomains = item.get("subdomain", [])
    content = item.get("content_preview", "")
    if not filepath.exists():
        return "file_missing"

    # The lock is held across the API call on purpose: releasing it in between
    # would let another worker rewrite the file from a stale copy.
    with _lock_for(filepath):
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                concepts = json.load(f)
        except Exception:
            return "read_error"
        if not isinstance(concepts, list):
            return "not_list"

        # Find the first same-titled concept that is still unclassified.
        matched = False
        for concept in concepts:
            if not isinstance(concept, dict):
                continue
            if concept.get("title", "") != title:
                continue
            raw = concept.get("domain")
            if raw is None:
                raw = []  # explicit null counts as "no domain"
            elif isinstance(raw, str):
                raw = [raw]
            # Only re-enrich if still stuck on Reference
            if raw == ["Reference"] or raw == []:
                key = key_rotator.next()
                concept["domain"] = classify_concept(title, subdomains, content, key)
                concept["_reenriched"] = True
                matched = True
                break
        if not matched:
            return "already_fixed"

        try:
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(concepts, f, indent=2, ensure_ascii=False)
        except Exception:
            return "write_error"
    return "ok"
def main():
    """CLI entry point: re-classify every concept listed in UNKNOWNS_FILE.

    Options:
        --workers N   Size of the worker thread pool (default 16).
        --limit N     Only process the first N queued concepts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--workers", type=int, default=16)
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()
    keys = load_gemini_keys()
    if not keys:
        log.error("No Gemini keys found in .env")
        return
    rotator = KeyRotator(keys)

    if not UNKNOWNS_FILE.exists():
        log.error(f"{UNKNOWNS_FILE} not found; nothing to re-enrich")
        return
    unknowns = []
    with open(UNKNOWNS_FILE, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                unknowns.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # One damaged line should not discard the rest of the queue.
                log.warning(f"Skipping malformed line {lineno} of {UNKNOWNS_FILE}: {exc}")
    if args.limit:
        unknowns = unknowns[:args.limit]
    total = len(unknowns)
    log.info(f"Re-enriching {total:,} concepts | {args.workers} workers | {len(keys)} API keys")
    log.info(f"Estimated Gemini Flash cost: ~${total * 0.0004:.2f} (conservative)")

    # Results are tallied on this thread only, so no lock is needed. There is
    # also no sleep here: every task is already submitted, so pausing between
    # results would slow collection without throttling the workers.
    results = defaultdict(int)
    done = 0
    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futures = [ex.submit(process_unknown, item, rotator) for item in unknowns]
        for future in as_completed(futures):
            try:
                status = future.result()
            except Exception:
                # Keep going: one bad item must not abort the whole batch.
                log.exception("Worker raised while processing a concept")
                status = "exception"
            results[status] += 1
            done += 1
            if done % 5000 == 0:
                pct = done / total * 100
                log.info(f" Progress: {done:,}/{total:,} ({pct:.1f}%) | {dict(results)}")

    log.info("── Final Results ─────────────────────────────────────────────")
    for status, count in sorted(results.items(), key=lambda x: -x[1]):
        log.info(f" {status:<25} {count:>10,}")
    log.info(f" Total: {total:,}")


if __name__ == "__main__":
    main()

428
scripts/domain_remap.py Executable file
View file

@ -0,0 +1,428 @@
#!/usr/bin/env python3
"""
domain_remap.py — Fix RECON concept domain classifications without API calls.
What this does:
1. Strips "Reference" from concepts that have other real domains
2. Remaps variant domain spellings to canonical names
3. Reclassifies solo-Reference concepts using their subdomain tags
4. Writes a JSONL file of true unknowns for API re-enrichment
Each window file is a JSON array of concept dicts.
Field names: "domain" (list), "subdomain" (list)
Usage:
python3 /opt/recon/scripts/domain_remap.py --dry-run # report only
python3 /opt/recon/scripts/domain_remap.py # apply fixes
python3 /opt/recon/scripts/domain_remap.py --workers 16
"""
import json
import argparse
import threading
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
# Root directory searched recursively for window_*.json concept files.
CONCEPTS_DIR = Path("/opt/recon/data/concepts")
# JSONL file that receives the concepts this script could not classify.
UNKNOWNS_OUTPUT = Path("/opt/recon/data/remap_unknowns.jsonl")
# Domain names that are kept unchanged by remap_domains().
CANONICAL_DOMAINS = {
"Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
"Foundational Skills", "Communications", "Medical", "Food Systems",
"Navigation", "Logistics", "Power Systems", "Leadership",
"Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
}
# Variant → Canonical mapping
VARIANT_MAP = {
# Defense & Tactics
"Tactical Ops": "Defense & Tactics",
"Tactical_Ops": "Defense & Tactics",
"Tactical Operations": "Defense & Tactics",
"Tactical": "Defense & Tactics",
"Tactical Skills": "Defense & Tactics",
"Tactics": "Defense & Tactics",
"Tactics & Defense": "Defense & Tactics",
"Reconnaissance": "Defense & Tactics",
"Fire Support": "Defense & Tactics",
"Improvised Munitions": "Defense & Tactics",
"Military Intelligence": "Defense & Tactics",
"Military History": "Defense & Tactics",
"Military Engineering": "Defense & Tactics",
# Medical
"Medical Care": "Medical",
"Medical Alternatives": "Medical",
"Medical/Dental": "Medical",
"Medical & Dental": "Medical",
"medical": "Medical",
"Medical Awareness": "Medical",
"Medical Disasters": "Medical",
"Medical Emergency Survival": "Medical",
"Medical Procedures": "Medical",
"Medical Treatment": "Medical",
"Medical Science": "Medical",
"Medical History": "Medical",
"Medical Diagnosis": "Medical",
"Medical Skills": "Medical",
"Medical Supply": "Medical",
"Medical Gear": "Medical",
"Medical Kits": "Medical",
"Medical Logistics": "Logistics",
"Medical First Aid": "Medical",
"Medical Ethics": "Medical",
"Medical Reference Ranges": "Medical",
"Medical andSurgical Hints": "Medical",
"Medical Aspects of Radiation Injury": "Medical",
"Medical Uses": "Medical",
"Medical Care in Developing Countries": "Medical",
"Survival Medicine": "Medical",
"Emergency War Surgery": "Medical",
"First Aid": "Medical",
"First Aid and Life Saving": "Medical",
"Veterinary Medicine": "Medical",
"Veterinary Hygiene": "Medical",
"Veterinary": "Medical",
"Pharmacology": "Medical",
"Public Health": "Medical",
"Health": "Medical",
# Food Systems
"Food_Systems": "Food Systems",
"Food_systems": "Food Systems",
"food_systems": "Food Systems",
"Food Preservation": "Food Systems",
"Food Safety": "Food Systems",
"Food Security": "Food Systems",
"Food & Nutrition": "Food Systems",
"Diet & Nutrition": "Food Systems",
"Culinary Arts": "Food Systems",
"Foodprocessing": "Food Systems",
"Food": "Food Systems",
# Sustainment Systems
"Sustainment_Systems": "Sustainment Systems",
"Agriculture": "Sustainment Systems",
"Agriculture & Natural Resources": "Sustainment Systems",
"Agriculture and Natural Resources": "Sustainment Systems",
"Horticulture": "Sustainment Systems",
"Gardening": "Sustainment Systems",
"Hydroponics": "Sustainment Systems",
"Survival Skills": "Sustainment Systems",
# Foundational Skills
"Foundational_Skills": "Foundational Skills",
"Primitive Living Skills": "Foundational Skills",
"Woodcraft": "Foundational Skills",
"Home Workshop": "Foundational Skills",
"Science": "Foundational Skills",
"Engineering": "Foundational Skills",
"Construction": "Foundational Skills",
"Industrial Processes": "Foundational Skills",
"Machine Technology": "Foundational Skills",
"Training": "Foundational Skills",
"Education": "Foundational Skills",
# Off-Grid Systems
"Off-Grid_Systems": "Off-Grid Systems",
"Appropriate Technology": "Off-Grid Systems",
# Power Systems
"Homebrewed Electricity": "Power Systems",
"Renewable Energy": "Power Systems",
"Renewable Energy FAQs": "Power Systems",
"Alternative Fuels": "Power Systems",
"Power_Systems": "Power Systems",
# Water Systems
"Water_Systems": "Water Systems",
# Community Coordination
"Community_Coordination": "Community Coordination",
"Community_coordination": "Community Coordination",
"Community": "Community Coordination",
# Leadership
"Leadership & Planning": "Leadership",
"Planning": "Leadership",
"Administration": "Leadership",
"Governance": "Leadership",
"Government": "Leadership",
# Communications
"Emergency Communications": "Communications",
# Security
"Security Systems": "Security",
# Logistics
"Transportation": "Logistics",
# Scenario Playbooks
"General Preparedness": "Scenario Playbooks",
"Emergency Preparedness": "Scenario Playbooks",
"Emergency Management": "Scenario Playbooks",
"Wilderness Preparedness": "Scenario Playbooks",
"Urban Preparedness": "Scenario Playbooks",
"Winter Preparedness": "Scenario Playbooks",
# Discard (noise domains)
"Humor": None,
"Recreation": None,
"Business": None,
"Finance": None,
"Economics": None,
"Economics/Finances": None,
"Weird Science": None,
}
# Subdomain keyword → canonical domain (for solo-Reference reclassification)
SUBDOMAIN_MAP = {
"first aid": "Medical",
"emergency care": "Medical",
"emergency medicine": "Medical",
"trauma": "Medical",
"anatomy": "Medical",
"oral rehydration": "Medical",
"ors": "Medical",
"pharmacology": "Medical",
"toxicology": "Medical",
"antidote": "Medical",
"nerve agent": "Defense & Tactics",
"chemical warfare": "Defense & Tactics",
"biological warfare": "Defense & Tactics",
"nbc": "Defense & Tactics",
"infectious disease": "Medical",
"microbiology": "Medical",
"virology": "Medical",
"bacteriology": "Medical",
"pediatric": "Medical",
"surgery": "Medical",
"wound care": "Medical",
"veterinary": "Medical",
"dental": "Medical",
"dentistry": "Medical",
"herbal": "Medical",
"medicinal plant": "Medical",
"medicinal herb": "Medical",
"herbalism": "Medical",
"food preservation": "Food Systems",
"canning": "Food Systems",
"fermentation": "Food Systems",
"food storage": "Food Systems",
"food safety": "Food Systems",
"cooking": "Food Systems",
"food processing": "Food Systems",
"agriculture": "Sustainment Systems",
"soil": "Sustainment Systems",
"permaculture": "Sustainment Systems",
"agroforestry": "Sustainment Systems",
"livestock": "Sustainment Systems",
"animal husbandry": "Sustainment Systems",
"beekeeping": "Sustainment Systems",
"foraging": "Sustainment Systems",
"hunting": "Sustainment Systems",
"fishing": "Sustainment Systems",
"gardening": "Sustainment Systems",
"mycology": "Sustainment Systems",
"mushroom": "Sustainment Systems",
"water purification": "Water Systems",
"water filtration": "Water Systems",
"water sanitation": "Water Systems",
"water disinfection": "Water Systems",
"water storage": "Water Systems",
"well construction": "Water Systems",
"rainwater": "Water Systems",
"solar": "Power Systems",
"wind turbine": "Power Systems",
"battery": "Power Systems",
"batteries": "Power Systems",
"generator": "Power Systems",
"photovoltaic": "Power Systems",
"charge controller": "Power Systems",
"inverter": "Power Systems",
"biogas": "Off-Grid Systems",
"biomass": "Off-Grid Systems",
"wood gasification": "Off-Grid Systems",
"rocket stove": "Off-Grid Systems",
"mechanical system": "Off-Grid Systems",
"power transmission": "Off-Grid Systems",
"radio": "Communications",
"ham radio": "Communications",
"amateur radio": "Communications",
"antenna": "Communications",
"meshtastic": "Communications",
"encryption": "Communications",
"navigation": "Navigation",
"celestial navigation": "Navigation",
"land navigation": "Navigation",
"map reading": "Navigation",
"compass": "Navigation",
"pottery": "Foundational Skills",
"ceramics": "Foundational Skills",
"blacksmithing": "Foundational Skills",
"woodworking": "Foundational Skills",
"leatherwork": "Foundational Skills",
"textile": "Foundational Skills",
"masonry": "Foundational Skills",
"metalworking": "Foundational Skills",
"historical technology": "Foundational Skills",
"weapons": "Defense & Tactics",
"firearms": "Defense & Tactics",
"ballistics": "Defense & Tactics",
"tactics": "Defense & Tactics",
"perimeter": "Security",
"surveillance": "Security",
"supply chain": "Logistics",
"logistics": "Logistics",
"leadership": "Leadership",
"governance": "Leadership",
"community": "Community Coordination",
"emergency preparedness": "Scenario Playbooks",
"disaster": "Scenario Playbooks",
"evacuation": "Scenario Playbooks",
}
def remap_domains(domains):
    """Normalise a list of domain strings to canonical names.

    ``"Reference"`` is removed, canonical names are kept, VARIANT_MAP keys are
    replaced by their target (a ``None`` target means "discard"), and names
    that appear in neither table are dropped.

    Args:
        domains: Iterable of domain strings; None is treated as empty.

    Returns:
        list[str]: De-duplicated canonical names in first-seen order, so
        repeated runs write identical output.
    """
    result = {}  # dict keys act as an insertion-ordered set
    for d in domains or ():
        if d == "Reference":
            continue
        if d in CANONICAL_DOMAINS:
            result[d] = None
        elif d in VARIANT_MAP:
            mapped = VARIANT_MAP[d]
            if mapped:  # None means discard
                result[mapped] = None
        # Unknown non-canonical domains: drop them
    return list(result)
def classify_by_subdomain(subdomains, keyword_map=None):
    """Infer canonical domain(s) from subdomain tags by keyword matching.

    Keywords longer than three characters match as case-insensitive
    substrings. Keywords of three characters or fewer (acronyms such as
    "ors") must match a whole word, so that "ors" no longer fires on
    "generators" or "motors".

    Args:
        subdomains: Iterable of tag strings; None is treated as empty and
            non-string entries are ignored.
        keyword_map: Mapping of lowercase keyword -> domain; defaults to
            SUBDOMAIN_MAP.

    Returns:
        list[str] | None: Matched domains, de-duplicated in first-match
        order, or None when nothing matched.
    """
    if keyword_map is None:
        keyword_map = SUBDOMAIN_MAP
    found = {}  # dict keys act as an insertion-ordered set
    for sd in subdomains or ():
        if not isinstance(sd, str):
            continue
        sd_lower = sd.lower().strip()
        # Whole words of the tag; anything that is not alphanumeric separates words.
        words = set("".join(ch if ch.isalnum() else " " for ch in sd_lower).split())
        for key, domain in keyword_map.items():
            hit = key in words if len(key) <= 3 else key in sd_lower
            if hit:
                found[domain] = None
    return list(found) if found else None
def _as_list(value):
    """Return *value* as a list: None -> [], a bare string -> [string]."""
    if value is None:
        return []
    if isinstance(value, str):
        return [value]
    return value


def process_window_file(filepath, dry_run):
    """Fix the domain field of every concept in one window JSON file.

    For each concept dict:
      * without "Reference": variant names are remapped to canonical ones;
      * "Reference" plus real domains: "Reference" is dropped;
      * only "Reference" (or Reference plus noise): the domain is inferred
        from the subdomain tags, and if that fails the concept is queued for
        API re-enrichment.

    Args:
        filepath: Path of the window file (a JSON array of concept dicts).
        dry_run: When true, the file is analysed but never rewritten.

    Returns:
        tuple[dict, list]: Per-file counters, and the list of queued unknowns
        (dicts with ``filepath``, ``title``, ``subdomain``,
        ``content_preview``). Unreadable files yield ``{"parse_error": 1}``
        and non-array files ``{"skip_not_list": 1}``, each with no unknowns.
    """
    stats = defaultdict(int)
    unknowns = []
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            concepts = json.load(f)
    except (OSError, ValueError):
        return {"parse_error": 1}, []
    if not isinstance(concepts, list):
        return {"skip_not_list": 1}, []

    modified = False
    for concept in concepts:
        if not isinstance(concept, dict):
            continue
        # _as_list also absorbs explicit nulls, which previously raised
        # TypeError and took down the whole run via future.result().
        raw_domains = _as_list(concept.get("domain"))
        subdomains = _as_list(concept.get("subdomain"))
        has_reference = "Reference" in raw_domains
        non_reference = [d for d in raw_domains if d != "Reference"]

        if not has_reference:
            # No Reference — just fix any variant names
            # NOTE(review): a concept whose domains are all unknown ends up
            # with an empty list here and is not queued for re-enrichment.
            remapped = remap_domains(raw_domains)
            if set(remapped) != set(raw_domains):
                concept["domain"] = remapped
                modified = True
                stats["variant_remapped"] += 1
            else:
                stats["no_change"] += 1
            continue

        # Has Reference — what else does it have?
        remapped_others = remap_domains(non_reference)
        if remapped_others:
            # Reference + real domains: drop Reference, keep the rest
            concept["domain"] = remapped_others
            modified = True
            stats["reference_stripped"] += 1
            continue

        # Solo Reference (or Reference + only-noise): try subdomain lookup
        inferred = classify_by_subdomain(subdomains)
        if inferred:
            concept["domain"] = inferred
            concept["_reclassified_from_reference"] = True
            modified = True
            stats["subdomain_reclassified"] += 1
            continue

        # True unknown — needs API re-enrichment
        unknowns.append({
            "filepath": str(filepath),
            "title": concept.get("title", ""),
            "subdomain": subdomains,
            "content_preview": str(concept.get("content", concept.get("summary", "")))[:300],
        })
        stats["needs_enrichment"] += 1

    if modified and not dry_run:
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(concepts, f, indent=2, ensure_ascii=False)
    return dict(stats), unknowns
def main():
    """CLI entry point: remap domains in every window file under CONCEPTS_DIR.

    Fans the files out to a thread pool, merges the per-file counters, prints
    a summary and, unless --dry-run is given, writes the unclassified
    concepts to UNKNOWNS_OUTPUT as JSON lines.
    """
    cli = argparse.ArgumentParser(description="Remap RECON concept domains")
    cli.add_argument("--dry-run", action="store_true", help="Report without writing")
    cli.add_argument("--workers", type=int, default=16)
    args = cli.parse_args()

    print(f"[REMAP] Scanning {CONCEPTS_DIR}")
    print(f"[REMAP] Dry run: {args.dry_run} | Workers: {args.workers}")
    window_files = list(CONCEPTS_DIR.rglob("window_*.json"))
    print(f"[REMAP] Found {len(window_files):,} window files")

    total_stats = defaultdict(int)
    all_unknowns = []
    lock = threading.Lock()
    done = 0
    with ThreadPoolExecutor(max_workers=args.workers) as pool:
        pending = {}
        for path in window_files:
            pending[pool.submit(process_window_file, path, args.dry_run)] = path
        for finished in as_completed(pending):
            file_stats, unknowns = finished.result()
            with lock:
                for name in file_stats:
                    total_stats[name] += file_stats[name]
                all_unknowns.extend(unknowns)
                done += 1
                if done % 5000 == 0:
                    print(f" {done:,}/{len(window_files):,} files processed...")

    # Summary, most frequent outcome first.
    print("\n── Results ─────────────────────────────────────────────────")
    ranked = sorted(total_stats.items(), key=lambda x: -x[1])
    for status, count in ranked:
        print(f" {status:<35} {count:>10,}")
    total_concepts = sum(total_stats.values())
    print(f"\n Total concepts processed: {total_concepts:>10,}")
    print(f" True unknowns for re-enrichment:{len(all_unknowns):>10,}")

    if all_unknowns and not args.dry_run:
        with open(UNKNOWNS_OUTPUT, "w", encoding="utf-8") as f:
            for item in all_unknowns:
                f.write(json.dumps(item) + "\n")
        print(f"\n Unknowns written to: {UNKNOWNS_OUTPUT}")
    if args.dry_run:
        print("\n [DRY RUN] No files were modified.")


if __name__ == "__main__":
    main()

469
scripts/migrate_domains.py Normal file
View file

@ -0,0 +1,469 @@
#!/usr/bin/env python3
"""
migrate_domains.py — Reclassify 5 legacy domains via Gemini Flash.
Targets: Sustainment Systems, Off-Grid Systems, Defense & Tactics,
Community Coordination, Leadership
Maps each to one of the 18 approved domains. 16 parallel workers,
checkpoint file, crash-safe, incremental saves, progress every 5,000.
Usage:
python3 /tmp/migrate_domains.py [--dry-run] [--workers 16] [--limit N]
"""
import json
import time
import random
import logging
import argparse
import threading
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import FieldCondition, MatchValue, Filter
# Suppress noisy HTTP logs
import logging as _logging
_logging.getLogger("httpx").setLevel(_logging.WARNING)
_logging.getLogger("qdrant_client").setLevel(_logging.WARNING)
LOG_FILE = Path("/opt/recon/logs/migrate_domains.log")
CHECKPOINT_FILE = Path("/opt/recon/data/migrate_domains_checkpoint.json")
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
)
log = logging.getLogger("migrate_domains")
# ── Constants ───────────────────────────────────────────────────────────────
# Approved target domains. A Gemini answer is accepted only when it matches one
# of these strings.
VALID_DOMAINS = {
    'Agriculture & Livestock', 'Civil Organization', 'Communications',
    'Food Systems', 'Foundational Skills', 'Logistics', 'Medical',
    'Navigation', 'Operations', 'Power Systems', 'Preservation & Storage',
    'Security', 'Shelter & Construction', 'Technology', 'Tools & Equipment',
    'Vehicles', 'Water Systems', 'Wilderness Skills',
}
# Legacy domains whose points are scrolled and reclassified by this script.
SOURCE_DOMAINS = {
    'Sustainment Systems', 'Off-Grid Systems', 'Defense & Tactics',
    'Community Coordination', 'Leadership',
}
# Comma-separated, alphabetically sorted rendering of VALID_DOMAINS.
DOMAIN_LIST_STR = ', '.join(sorted(VALID_DOMAINS))
# Prompt template with {content}, {summary} and {subdomain} placeholders.
# NOTE: the domain list below is written out by hand rather than derived from
# VALID_DOMAINS, so the two must be kept in sync manually.
CLASSIFY_PROMPT = """\
Classify this knowledge concept into exactly one domain from this list:
Agriculture & Livestock, Civil Organization, Communications, Food Systems, Foundational Skills, Logistics, Medical, Navigation, Operations, Power Systems, Preservation & Storage, Security, Shelter & Construction, Technology, Tools & Equipment, Vehicles, Water Systems, Wilderness Skills
Return ONLY the exact domain string, nothing else. No explanation, no punctuation, no quotes.
Content: {content}
Summary: {summary}
Subdomain: {subdomain}
"""
# Domain name used as the catch-all default.
DOMAIN_FALLBACK = 'Foundational Skills'
# ── Key management ──────────────────────────────────────────────────────────
def load_gemini_keys(env_path=None):
    """Read every ``GEMINI_KEY_*`` value from a dotenv-style file.

    Args:
        env_path: Path of the env file. Defaults to ``/opt/recon/.env``.

    Returns:
        List of key strings in file order.

    Raises:
        FileNotFoundError: the env file does not exist.
        ValueError: the file contains no usable ``GEMINI_KEY_*`` entry.
    """
    env_path = Path("/opt/recon/.env") if env_path is None else Path(env_path)
    if not env_path.exists():
        raise FileNotFoundError(f"{env_path} not found")
    keys = []
    for line in env_path.read_text().splitlines():
        if not line.startswith("GEMINI_KEY_"):
            continue
        # partition() never raises, unlike split("=", 1)[1] on a line without "=".
        _name, sep, value = line.partition("=")
        # Drop surrounding whitespace and optional quotes around the value.
        value = value.strip().strip('"').strip("'").strip()
        if sep and value:
            keys.append(value)
    if not keys:
        raise ValueError("No GEMINI_KEY_* found in .env")
    return keys
class KeyRotator:
    """Hands out API keys in round-robin order; the cursor is guarded by a lock."""

    def __init__(self, keys):
        self.keys = keys
        self._i = 0
        self._lock = threading.Lock()

    def next(self):
        """Return the key at the current cursor position and advance the cursor."""
        with self._lock:
            slot = self._i % len(self.keys)
            self._i += 1
            return self.keys[slot]
# ── Classification ──────────────────────────────────────────────────────────
def classify_domain(content, summary, subdomains, key):
    """Call Gemini Flash to classify a concept into one of the approved domains.

    Args:
        content: Concept text; only the first 400 characters are sent.
        summary: Concept summary; only the first 200 characters are sent.
        subdomains: List of subdomain labels; the first 10 are sent.
        key: Gemini API key to use for this call.

    Returns:
        A member of VALID_DOMAINS when the model answers with one, otherwise
        whatever heuristic_fallback() returns.
    """
    prompt = CLASSIFY_PROMPT.format(
        content=str(content)[:400] if content else "(none)",
        summary=str(summary)[:200] if summary else "(none)",
        # str() keeps a stray non-string label from breaking join().
        subdomain=", ".join(str(s) for s in subdomains[:10]) if subdomains else "(none)",
    )
    # NOTE(review): genai.configure() appears to set process-wide state, so with
    # several worker threads the key actually used may not be `key` — confirm
    # against the SDK documentation.
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "text/plain"}
    )
    # Case-insensitive lookup table back to the canonical spelling.
    canonical = {domain.lower(): domain for domain in VALID_DOMAINS}
    max_attempts = 4
    for retry in range(max_attempts):
        is_last = retry == max_attempts - 1
        try:
            resp = model.generate_content(prompt)
            value = resp.text.strip().strip('"').strip("'").strip()
            # Accept the answer regardless of case and of a trailing period
            # (both deviations can occur together).
            match = canonical.get(value.rstrip('.').strip().lower())
            if match is not None:
                return match
            # Invalid — retry with stricter prompt
            if not is_last:
                prompt = (
                    f"Your previous response '{value}' was invalid. "
                    f"You must return ONLY one of these exact strings: {DOMAIN_LIST_STR}\n\n"
                    f"Content: {str(content)[:300]}\n"
                    f"Return ONLY the exact domain string."
                )
                continue
        except Exception as e:
            err = str(e).lower()
            if any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
                # Back off only when another attempt will follow; sleeping
                # before falling through to the heuristic is wasted time.
                if not is_last:
                    time.sleep(min(5 * (2 ** retry) + random.uniform(0, 3), 60))
            else:
                log.warning(f"Gemini error (attempt {retry+1}): {e}")
                if retry >= 2:
                    break
    return heuristic_fallback(content, summary, subdomains)
def heuristic_fallback(content, summary, subdomains):
    """Pick a domain by keyword lookup when Gemini fails or answers invalidly.

    The summary, the subdomain labels and the first 200 characters of the
    content are lower-cased and scanned against an ordered rule table; the
    first rule with any keyword present wins. DOMAIN_FALLBACK is returned when
    nothing matches.

    NOTE: matching is plain substring containment, so short keywords also hit
    inside longer words (e.g. "sar" inside "necessary").
    """
    haystack = "{} {} {}".format(
        summary or '',
        ' '.join(subdomains or []),
        str(content or '')[:200],
    ).lower()
    # Order matters: earlier rules take precedence over later ones.
    rules = (
        ("Agriculture & Livestock", (
            "farming", "agriculture", "livestock", "animal husbandry", "poultry",
            "cattle", "crop", "soil fertility", "irrigation for crops")),
        ("Wilderness Skills", (
            "foraging", "hunting", "fishing", "bushcraft", "wilderness", "survival skill",
            "fire starting", "shelter building", "trapping", "tracking")),
        ("Preservation & Storage", (
            "food preservation", "canning", "dehydration", "smoking", "pickling",
            "fermentation", "food storage", "freeze dry")),
        ("Food Systems", (
            "cooking", "recipe", "nutrition", "food preparation", "baking",
            "food production", "meal")),
        ("Medical", (
            "first aid", "medical", "trauma", "surgery", "anatomy", "pharmacology",
            "wound", "triage", "diagnosis", "disease", "infection", "veterinary",
            "herbal medicine", "medicinal plant")),
        ("Communications", (
            "radio", "antenna", "ham radio", "communication", "signal",
            "networking", "meshtastic", "comms")),
        ("Power Systems", (
            "solar", "battery", "generator", "wind turbine", "hydroelectric",
            "power grid", "inverter", "photovoltaic", "electricity")),
        ("Water Systems", (
            "water purification", "water filter", "well", "rainwater",
            "sanitation", "water treatment", "desalination")),
        ("Navigation", (
            "navigation", "compass", "map reading", "gps", "celestial",
            "orienteering", "land nav")),
        ("Security", (
            "security", "opsec", "perimeter", "surveillance", "threat",
            "intrusion detection", "physical security")),
        ("Vehicles", (
            "vehicle", "engine", "motor", "aircraft", "boat", "motorcycle",
            "truck", "maintenance", "diesel", "transmission")),
        ("Tools & Equipment", (
            "tool", "equipment", "wrench", "saw", "drill", "hammer",
            "hand tool", "power tool", "blade", "sharpening")),
        ("Shelter & Construction", (
            "construction", "building", "shelter", "carpentry", "masonry",
            "roofing", "concrete", "framing", "plumbing")),
        ("Technology", (
            "electronics", "computer", "software", "circuit", "programming",
            "technology", "digital", "engineering")),
        ("Logistics", (
            "supply chain", "logistics", "transport", "distribution",
            "inventory", "supply", "stockpile")),
        ("Civil Organization", (
            "governance", "civil", "community", "administration", "organization",
            "council", "democratic", "municipal")),
        ("Operations", (
            "tactics", "combat", "military", "mission", "patrol", "ambush",
            "defensive position", "fire team", "maneuver", "engagement",
            "search and rescue", "sar", "reconnaissance")),
    )
    for domain, keywords in rules:
        for keyword in keywords:
            if keyword in haystack:
                return domain
    return DOMAIN_FALLBACK
# ── Checkpoint ──────────────────────────────────────────────────────────────
class Checkpoint:
    """Checkpoint tracker for crash recovery.

    Keeps the set of completed point IDs in memory and persists it as JSON
    (``{"completed": [...]}``) at ``path``. ``mark_done`` and ``flush`` take an
    internal lock; ``is_done`` and ``count`` read the set without locking.
    """

    def __init__(self, path):
        self.path = path
        self._lock = threading.Lock()
        self._completed = set()
        self._dirty = 0  # marks recorded since the last write to disk
        self._load()

    def _load(self):
        """Populate the completed set from disk; start empty if unreadable."""
        if not self.path.exists():
            return
        try:
            data = json.loads(self.path.read_text())
            self._completed = set(data.get("completed", []))
        except Exception as e:
            # Starting over is deliberate best-effort, but say so: silently
            # dropping the checkpoint means every point is reprocessed.
            log.warning(f"Ignoring unreadable checkpoint {self.path}: {e}")
            self._completed = set()
            return
        log.info(f"Loaded checkpoint: {len(self._completed):,} completed points")

    def is_done(self, point_id):
        """Return True if ``point_id`` has been recorded as completed."""
        return point_id in self._completed

    def mark_done(self, point_id):
        """Record ``point_id`` and write to disk after every 1000 new marks."""
        with self._lock:
            self._completed.add(point_id)
            self._dirty += 1
            if self._dirty >= 1000:
                self._flush()

    def _flush(self):
        """Write the set to a temp file, then swap it in. Caller holds the lock."""
        # Without this the first flush fails when the data directory is missing.
        self.path.parent.mkdir(parents=True, exist_ok=True)
        tmp = self.path.with_suffix('.tmp')
        tmp.write_text(json.dumps({"completed": list(self._completed)}))
        # replace() overwrites an existing target on every platform.
        tmp.replace(self.path)
        self._dirty = 0

    def flush(self):
        """Persist the current set immediately."""
        with self._lock:
            self._flush()

    def count(self):
        """Return the number of completed point IDs held in memory."""
        return len(self._completed)
# ── Per-point processing ───────────────────────────────────────────────────
# Serialises updates to the shared `stats` dict, which process_point() mutates
# from several worker threads at once.
_STATS_LOCK = threading.Lock()


def process_point(point, qdrant, collection, key_rotator, checkpoint, dry_run, stats):
    """Reclassify one point and, unless dry-running, write the new domain back.

    Args:
        point: Scrolled record exposing ``.id`` and ``.payload``.
        qdrant: Client used for ``set_payload``.
        collection: Collection name.
        key_rotator: Source of API keys (``.next()``).
        checkpoint: Tracker consulted via ``is_done`` and updated via ``mark_done``.
        dry_run: When true, classify but do not write or checkpoint.
        stats: Shared dict counting ``"<old> -> <new>"`` mappings.

    Returns:
        ``"skipped"``, ``"would: <old> -> <new>"`` (dry run) or ``"ok"``.
    """
    point_id = point.id
    if checkpoint.is_done(point_id):
        return "skipped"
    payload = point.payload or {}
    content = payload.get("content", payload.get("summary", ""))
    summary = payload.get("summary", "")
    subdomains = payload.get("subdomain", [])
    if isinstance(subdomains, str):
        subdomains = [subdomains]
    old_domain = payload.get("domain", [])
    if isinstance(old_domain, list):
        old_domain_str = old_domain[0] if old_domain else "(empty)"
    else:
        old_domain_str = str(old_domain)
    key = key_rotator.next()
    new_domain = classify_domain(content, summary, subdomains, key)
    # Track the mapping. The get-then-set is not atomic, so concurrent workers
    # would otherwise lose increments.
    stats_key = f"{old_domain_str} -> {new_domain}"
    with _STATS_LOCK:
        stats[stats_key] = stats.get(stats_key, 0) + 1
    if dry_run:
        return f"would: {old_domain_str} -> {new_domain}"
    # Write new domain as single string
    qdrant.set_payload(
        collection_name=collection,
        payload={"domain": new_domain},
        points=[point_id],
    )
    checkpoint.mark_done(point_id)
    return "ok"
# ── Main loop ───────────────────────────────────────────────────────────────
SCROLL_BATCH = 5000
def count_source_domains(qdrant, collection):
    """Return ``{source_domain: exact point count}`` for every legacy domain."""

    def _exact_count(domain):
        # One exact-count query filtered on the `domain` payload field.
        domain_filter = Filter(
            must=[FieldCondition(key="domain", match=MatchValue(value=domain))]
        )
        response = qdrant.count(
            collection_name=collection,
            count_filter=domain_filter,
            exact=True,
        )
        return response.count

    return {domain: _exact_count(domain) for domain in SOURCE_DOMAINS}
def stream_and_process(qdrant, collection, rotator, checkpoint, workers, limit=None, dry_run=False):
    """Scroll source domains in batches, process with thread pool.

    Args:
        qdrant: Client used for scrolling and, via process_point, for writes.
        collection: Collection name.
        rotator: API key rotator handed to each worker.
        checkpoint: Tracker of completed point IDs; flushed periodically and at the end.
        workers: Thread pool size per batch.
        limit: Optional cap on the number of points processed in this run.
        dry_run: Forwarded to process_point.

    Returns:
        ``(done, skipped_checkpoint, stats, start)`` where ``stats`` counts
        old -> new mappings and ``start`` is the wall-clock start time.
    """
    # Counters are only touched from this thread (the as_completed loop runs
    # in the caller), so no lock is needed around them.
    done = 0
    skipped_checkpoint = 0
    start = time.time()
    stats = {}  # shared mapping stats
    for source_domain in sorted(SOURCE_DOMAINS):
        log.info(f"\n--- Processing domain: {source_domain} ---")
        offset = None
        domain_done = 0
        while True:
            scroll_results, offset = qdrant.scroll(
                collection_name=collection,
                limit=SCROLL_BATCH,
                with_payload=True,
                with_vectors=False,
                offset=offset,
                scroll_filter=Filter(
                    must=[FieldCondition(key="domain", match=MatchValue(value=source_domain))]
                ),
            )
            if not scroll_results:
                if offset is None:
                    break
                continue
            # Filter already checkpointed
            pending = [p for p in scroll_results if not checkpoint.is_done(p.id)]
            skipped_checkpoint += len(scroll_results) - len(pending)
            if limit:
                # Honour --limit inside the batch too; otherwise a small limit
                # still processes a whole scroll batch.
                pending = pending[:max(limit - done, 0)]
            if pending:
                with ThreadPoolExecutor(max_workers=workers) as ex:
                    futures = [
                        ex.submit(process_point, p, qdrant, collection, rotator,
                                  checkpoint, dry_run, stats)
                        for p in pending
                    ]
                    for future in as_completed(futures):
                        try:
                            future.result()
                        except Exception as e:
                            log.error(f"Worker error: {e}")
                        # Failed points are counted as handled as well.
                        done += 1
                        domain_done += 1
                        if done % 5000 == 0:
                            elapsed = time.time() - start
                            rate = done / elapsed * 60
                            log.info(f" {done:,} done | {rate:.0f}/min | "
                                     f"elapsed {elapsed/60:.1f}min")
                            checkpoint.flush()
            time.sleep(0.02)
            if limit and done >= limit:
                break
            if offset is None:
                break
        log.info(f" {source_domain}: {domain_done:,} vectors processed")
        if limit and done >= limit:
            break
    checkpoint.flush()
    return done, skipped_checkpoint, stats, start
def main():
    """CLI entry point: report counts and a cost estimate, then dry-run or migrate.

    Flags: ``--dry-run`` classifies a few samples per source domain and prints
    them without writing; ``--workers`` sets the thread pool size; ``--limit``
    caps the number of points processed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true",
                        help="Classify 20 samples without writing")
    parser.add_argument("--workers", type=int, default=16)
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()
    keys = load_gemini_keys()
    rotator = KeyRotator(keys)
    qdrant = QdrantClient(host="localhost", port=6333, timeout=120)
    collection = "recon_knowledge"
    checkpoint = Checkpoint(CHECKPOINT_FILE)
    # Count source domains
    counts = count_source_domains(qdrant, collection)
    total_source = sum(counts.values())
    pre_checkpoint = checkpoint.count()
    log.info(f"Source domain counts:")
    for domain, count in sorted(counts.items(), key=lambda x: -x[1]):
        log.info(f" {domain:30s} {count:>10,}")
    log.info(f" {'TOTAL':30s} {total_source:>10,}")
    log.info(f"Checkpoint: {pre_checkpoint:,} already completed")
    log.info(f"Workers: {args.workers} | Keys: {len(keys)}")
    # Cost estimate
    # Points migrated by an earlier run carry an approved domain now, so they
    # are already missing from total_source. Subtracting the checkpoint size
    # too would count them twice and can push the estimate below zero.
    remaining = total_source
    input_tokens = remaining * 200
    output_tokens = remaining * 5
    input_cost = input_tokens / 1_000_000 * 0.10
    output_cost = output_tokens / 1_000_000 * 0.40
    total_cost = input_cost + output_cost
    log.info(f"\nEstimated Gemini 2.0 Flash cost:")
    log.info(f" Vectors to process: {remaining:,}")
    log.info(f" Input: ~{input_tokens/1_000_000:.1f}M tokens = ${input_cost:.2f}")
    log.info(f" Output: ~{output_tokens/1_000_000:.1f}M tokens = ${output_cost:.2f}")
    log.info(f" TOTAL: ~${total_cost:.2f}")
    if args.dry_run:
        log.info(f"\nDRY RUN: classifying 20 samples...\n")
        for source_domain in sorted(SOURCE_DOMAINS):
            scroll_results, _ = qdrant.scroll(
                collection_name=collection,
                limit=5,
                with_payload=True,
                with_vectors=False,
                scroll_filter=Filter(
                    must=[FieldCondition(key="domain", match=MatchValue(value=source_domain))]
                ),
            )
            # Four samples from each of the source domains.
            for p in scroll_results[:4]:
                pay = p.payload
                title = pay.get("title", "(no title)")
                content = pay.get("content", pay.get("summary", ""))
                summary = pay.get("summary", "")
                subdomains = pay.get("subdomain", [])
                if isinstance(subdomains, str):
                    subdomains = [subdomains]
                key = rotator.next()
                new_domain = classify_domain(content, summary, subdomains, key)
                old = pay.get("domain", [])
                if isinstance(old, list):
                    old = old[0] if old else "?"
                # str() guards the `s` format spec and the slice against
                # non-string payload values.
                print(f" [{str(old):25s}] -> [{new_domain:25s}] {str(title)[:60]}")
        print(f"\nDRY RUN complete. ~{remaining:,} vectors would be migrated.")
        print(f"Estimated cost: ~${total_cost:.2f}")
        return
    # ── Full migration ──────────────────────────────────────────────────
    log.info(f"\nStarting full migration...")
    done, skipped_ckpt, stats, start = stream_and_process(
        qdrant, collection, rotator, checkpoint, args.workers, args.limit
    )
    elapsed = time.time() - start
    log.info(f"\n{'='*70}")
    log.info(f"MIGRATION COMPLETE in {elapsed/60:.1f}min:")
    log.info(f" Processed: {done:,}")
    log.info(f" Skipped (checkpoint): {skipped_ckpt:,}")
    log.info(f" Rate: {done/elapsed*60:.0f}/min")
    log.info(f"\nMapping distribution:")
    # Only the 30 most frequent mappings are reported.
    for mapping, count in sorted(stats.items(), key=lambda x: -x[1])[:30]:
        log.info(f" {mapping:<55s} {count:>8,}")
if __name__ == "__main__":
main()

469
scripts/migrate_skill_level.py Executable file
View file

@ -0,0 +1,469 @@
#!/usr/bin/env python3
"""
migrate_skill_level.py Replaces skill_level with knowledge_type + complexity
on all vectors in Qdrant and on-disk concept JSONs.
Scrolls entire collection, classifies each concept via Gemini Flash,
writes knowledge_type + complexity, deletes skill_level.
Crash-safe: completed point IDs tracked in checkpoint file.
Usage:
python3 /opt/recon/scripts/migrate_skill_level.py [--dry-run] [--workers 16] [--limit N]
"""
import json
import time
import random
import logging
import argparse
import threading
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import FieldCondition, MatchValue, Filter
import sys
sys.path.insert(0, '/opt/recon')
from lib.utils import get_config, setup_logging
# Suppress noisy HTTP request logging from qdrant_client/httpx
import logging as _logging
_logging.getLogger("httpx").setLevel(_logging.WARNING)
_logging.getLogger("qdrant_client").setLevel(_logging.WARNING)
LOG_FILE = Path("/opt/recon/logs/migrate_skill_level.log")
CHECKPOINT_FILE = Path("/opt/recon/data/migrate_skill_level_checkpoint.json")
CONCEPTS_DIR = Path("/opt/recon/data/concepts")
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
)
log = logging.getLogger("migrate_skill_level")
# ── Prompt ──────────────────────────────────────────────────────────────────
# Prompt template with {title}, {domain}, {subdomain} and {content}
# placeholders. The doubled braces on the last line are literal braces after
# str.format(), so the model sees a JSON skeleton.
CLASSIFY_PROMPT = """\
You are a knowledge classification engine. Given a concept, assign two fields:
knowledge_type what KIND of knowledge this is:
foundational concepts, definitions, theory, background knowledge, explanations of how things work
procedural step-by-step techniques, instructions, how-to skills, methods you execute
operational application under real conditions, decision-making, mission execution, judgment calls in context
complexity how much prior knowledge is needed:
basic requires little or no prior knowledge, introductory material, simple concepts
intermediate requires some domain familiarity, assumes foundational knowledge is in place
advanced requires significant experience or expertise, high-stakes or highly technical material
EXAMPLES:
- "Needle chest decompression procedure" procedural, advanced
- "What is soil texture and why does it matter" foundational, basic
- "Coordinating a fire team withdrawal under contact" operational, advanced
- "How to start a campfire with a ferro rod" procedural, basic
- "Antenna gain and radiation patterns explained" foundational, intermediate
- "Triage decision-making in a mass casualty event" operational, advanced
- "Step-by-step: building a Dakota fire hole" procedural, intermediate
- "Understanding the water cycle" foundational, basic
Concept title: {title}
Concept domain: {domain}
Concept subdomain: {subdomain}
Concept content: {content}
Return ONLY valid JSON, no markdown, no explanation:
{{"knowledge_type": "foundational|procedural|operational", "complexity": "basic|intermediate|advanced"}}
"""
# Accepted values for the two fields; anything else is treated as invalid.
VALID_KNOWLEDGE_TYPES = {"foundational", "procedural", "operational"}
VALID_COMPLEXITIES = {"basic", "intermediate", "advanced"}
# ── Key management ──────────────────────────────────────────────────────────
def load_gemini_keys(env_path=None):
    """Read every ``GEMINI_KEY_*`` value from a dotenv-style file.

    Args:
        env_path: Path of the env file. Defaults to ``/opt/recon/.env``.

    Returns:
        List of key strings in file order.

    Raises:
        FileNotFoundError: the env file does not exist (raised by ``read_text``).
        ValueError: the file contains no usable ``GEMINI_KEY_*`` entry.
    """
    env_path = Path("/opt/recon/.env") if env_path is None else Path(env_path)
    keys = []
    for line in env_path.read_text().splitlines():
        if not line.startswith("GEMINI_KEY_"):
            continue
        # partition() never raises, unlike split("=", 1)[1] on a line without "=".
        _name, sep, value = line.partition("=")
        # Drop surrounding whitespace and optional quotes around the value.
        value = value.strip().strip('"').strip("'").strip()
        if sep and value:
            keys.append(value)
    if not keys:
        # Fail here with a clear message; an empty list would only surface
        # later as a ZeroDivisionError inside KeyRotator.next().
        raise ValueError("No GEMINI_KEY_* found in .env")
    return keys
class KeyRotator:
    """Cycles through a list of API keys, one per call, under a lock."""

    def __init__(self, keys):
        self.keys = keys
        self._i = 0
        self._lock = threading.Lock()

    def next(self):
        """Return the next key, wrapping back to the first after the last."""
        with self._lock:
            index = self._i % len(self.keys)
            chosen = self.keys[index]
            self._i += 1
        return chosen
# ── Classification ──────────────────────────────────────────────────────────
def classify(title, domains, subdomains, content, key):
    """Call Gemini Flash to classify knowledge_type + complexity.

    Args:
        title: Concept title.
        domains: Domain labels (list, or a single string); the first 5 are sent.
        subdomains: Subdomain labels (list, or a single string); the first 10 are sent.
        content: Concept text; only the first 400 characters are sent.
        key: Gemini API key to use for this call.

    Returns:
        ``(knowledge_type, complexity)``, either validated model output or the
        result of heuristic_fallback().
    """
    # A bare string would otherwise be sliced and joined character by character.
    if isinstance(domains, str):
        domains = [domains]
    if isinstance(subdomains, str):
        subdomains = [subdomains]
    prompt = CLASSIFY_PROMPT.format(
        title=title or "(untitled)",
        domain=", ".join(str(d) for d in domains[:5]) if domains else "(none)",
        subdomain=", ".join(str(s) for s in subdomains[:10]) if subdomains else "(none)",
        content=str(content)[:400] if content else "(none)",
    )
    # NOTE(review): genai.configure() appears to set process-wide state, so with
    # several worker threads the key actually used may not be `key` — confirm
    # against the SDK documentation.
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"}
    )
    max_attempts = 4
    invalid_seen = False
    for retry in range(max_attempts):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
            kt = data.get("knowledge_type", "").lower().strip()
            cx = data.get("complexity", "").lower().strip()
            if kt in VALID_KNOWLEDGE_TYPES and cx in VALID_COMPLEXITIES:
                return kt, cx
            # Invalid values — retry once, then give up. (Previously the
            # `continue` here was a no-op and every attempt was spent.)
            if invalid_seen:
                break
            invalid_seen = True
        except Exception as e:
            err = str(e).lower()
            if any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
                # Back off only when another attempt will follow.
                if retry < max_attempts - 1:
                    time.sleep(min(5 * (2 ** retry) + random.uniform(0, 3), 60))
            else:
                # Leave a trace so silent fallbacks can be diagnosed.
                log.warning(f"Gemini error (attempt {retry+1}): {e}")
                break
    # Fallback heuristic based on title, subdomains and content keywords
    return heuristic_fallback(title, subdomains or [], content)
def heuristic_fallback(title, subdomains, content):
    """Guess (knowledge_type, complexity) from keywords when Gemini fails.

    The title, the subdomain labels and the first 200 characters of the
    content are lower-cased and scanned for signal phrases.

    NOTE: matching is plain substring containment, so a signal such as "plan"
    also fires inside "plant".
    """
    haystack = "{} {} {}".format(
        title, ' '.join(subdomains), str(content)[:200]
    ).lower()

    def mentions(signals):
        return any(signal in haystack for signal in signals)

    # Knowledge type: operational wins over procedural; foundational is the default.
    operational_signals = ("decision", "coordinate", "execute", "deploy",
                           "mission", "triage", "under fire", "in the field",
                           "real-world", "scenario", "assessment", "plan")
    procedural_signals = ("how to", "step-by-step", "procedure", "instructions",
                          "method", "technique", "build", "make", "construct",
                          "install", "assemble", "recipe", "prepare")
    if mentions(operational_signals):
        kt = "operational"
    elif mentions(procedural_signals):
        kt = "procedural"
    else:
        kt = "foundational"

    # Complexity: basic wins over advanced; intermediate is the default.
    basic_signals = ("introduction", "what is", "basic", "beginner", "overview",
                     "definition", "simple", "fundamentals")
    advanced_signals = ("advanced", "expert", "complex", "critical", "high-stakes",
                        "surgery", "trauma", "tactical", "classified")
    if mentions(basic_signals):
        cx = "basic"
    elif mentions(advanced_signals):
        cx = "advanced"
    else:
        cx = "intermediate"
    return kt, cx
# ── Checkpoint management ───────────────────────────────────────────────────
class Checkpoint:
    """Checkpoint tracker for crash recovery.

    Keeps the set of completed point IDs in memory and persists it as JSON
    (``{"completed": [...]}``) at ``path``. ``mark_done`` and ``flush`` take an
    internal lock; ``is_done`` and ``count`` read the set without locking.
    """

    def __init__(self, path):
        self.path = path
        self._lock = threading.Lock()
        self._completed = set()
        self._dirty = 0  # marks recorded since the last write to disk
        self._load()

    def _load(self):
        """Populate the completed set from disk; start empty if unreadable."""
        if not self.path.exists():
            return
        try:
            data = json.loads(self.path.read_text())
            self._completed = set(data.get("completed", []))
        except Exception as e:
            # Starting over is deliberate best-effort, but say so: silently
            # dropping the checkpoint means every point is reprocessed.
            log.warning(f"Ignoring unreadable checkpoint {self.path}: {e}")
            self._completed = set()
            return
        log.info(f"Loaded checkpoint: {len(self._completed):,} completed points")

    def is_done(self, point_id):
        """Return True if ``point_id`` has been recorded as completed."""
        return point_id in self._completed

    def mark_done(self, point_id):
        """Record ``point_id`` and write to disk after every 1000 new marks."""
        with self._lock:
            self._completed.add(point_id)
            self._dirty += 1
            if self._dirty >= 1000:
                self._flush()

    def _flush(self):
        """Write the set to a temp file, then swap it in. Caller holds the lock."""
        # Without this the first flush fails when the data directory is missing.
        self.path.parent.mkdir(parents=True, exist_ok=True)
        tmp = self.path.with_suffix('.tmp')
        tmp.write_text(json.dumps({"completed": list(self._completed)}))
        # replace() overwrites an existing target on every platform.
        tmp.replace(self.path)
        self._dirty = 0

    def flush(self):
        """Persist the current set immediately."""
        with self._lock:
            self._flush()

    def count(self):
        """Return the number of completed point IDs held in memory."""
        return len(self._completed)
# ── Concept JSON update ────────────────────────────────────────────────────
# Serialises read-modify-write cycles on window files: several worker threads
# may update concepts that live in the same file.
_CONCEPT_JSON_LOCK = threading.Lock()


def update_concept_json(doc_hash, title, knowledge_type, complexity):
    """Update on-disk concept JSON: add knowledge_type + complexity, remove skill_level.

    Scans the ``window_*.json`` files under ``CONCEPTS_DIR / doc_hash`` and
    rewrites the first file that contains a concept whose ``title`` equals
    ``title`` (every matching concept in that file is updated).

    Returns:
        True if a file was rewritten, False if the directory is missing or no
        concept matched.
    """
    doc_dir = CONCEPTS_DIR / doc_hash
    if not doc_dir.exists():
        return False
    for wf in doc_dir.glob("window_*.json"):
        try:
            with _CONCEPT_JSON_LOCK:
                with open(wf, "r", encoding="utf-8") as f:
                    concepts = json.load(f)
                changed = False
                for c in concepts:
                    if not isinstance(c, dict):
                        continue
                    if c.get("title") == title:
                        c["knowledge_type"] = knowledge_type
                        c["complexity"] = complexity
                        c.pop("skill_level", None)
                        changed = True
                if changed:
                    # Write beside the target and swap it in, so an interrupted
                    # write cannot leave a truncated window file. The ".tmp"
                    # suffix keeps it out of the window_*.json glob.
                    tmp = wf.with_name(wf.name + ".tmp")
                    with open(tmp, "w", encoding="utf-8") as f:
                        json.dump(concepts, f, indent=2, ensure_ascii=False)
                    tmp.replace(wf)
                    return True
        except Exception as e:
            # Still best-effort (keep scanning), but leave a trace.
            log.warning(f"Could not update concept file {wf}: {e}")
    return False
# ── Per-point processing ───────────────────────────────────────────────────
def process_point(point, qdrant, collection, key_rotator, checkpoint, dry_run):
    """Classify one point and, unless dry-running, migrate it in Qdrant and on disk.

    Returns ``"skipped"`` for checkpointed points, ``"kt=..., cx=..."`` in dry
    run, and ``"ok"`` after the payload write, the ``skill_level`` delete, the
    JSON update and the checkpoint mark.
    """
    def as_list(value):
        # Payload fields may hold a single string instead of a list.
        return [value] if isinstance(value, str) else value

    pid = point.id
    if checkpoint.is_done(pid):
        return "skipped"

    fields = point.payload
    title = fields.get("title", "")
    domains = as_list(fields.get("domain", []))
    subdomains = as_list(fields.get("subdomain", []))
    content = fields.get("content", fields.get("summary", ""))
    doc_hash = fields.get("doc_hash", "")

    api_key = key_rotator.next()
    knowledge_type, complexity = classify(title, domains, subdomains, content, api_key)
    if dry_run:
        return f"kt={knowledge_type}, cx={complexity}"

    # Write new fields
    new_fields = {"knowledge_type": knowledge_type, "complexity": complexity}
    qdrant.set_payload(collection_name=collection, payload=new_fields, points=[pid])
    # Delete old field
    qdrant.delete_payload(collection_name=collection, keys=["skill_level"], points=[pid])
    # Update JSON on disk
    if doc_hash:
        update_concept_json(doc_hash, title, knowledge_type, complexity)
    checkpoint.mark_done(pid)
    return "ok"
# ── Streaming batch processor ───────────────────────────────────────────────
SCROLL_BATCH = 5000 # vectors per scroll batch — keeps memory bounded (~50MB)
def count_collection(qdrant, collection):
    """Return the ``points_count`` reported by the collection info."""
    return qdrant.get_collection(collection).points_count
def stream_and_process(qdrant, collection, rotator, checkpoint, workers, limit=None):
    """Scroll in batches, process each batch with thread pool, then discard.

    Only SCROLL_BATCH payloads are held at any time. Points without a
    ``skill_level`` field and points already in the checkpoint are skipped.

    Args:
        qdrant: Client used for scrolling and, via process_point, for writes.
        collection: Collection name.
        rotator: API key rotator handed to each worker.
        checkpoint: Tracker of completed point IDs; flushed periodically and at the end.
        workers: Thread pool size per batch.
        limit: Optional cap on the number of points processed in this run.

    Returns:
        ``(done, skipped_checkpoint, skipped_no_skill, results_agg, start)``
        where ``results_agg`` counts worker statuses.
    """
    # Counters are only touched from this thread (the as_completed loop runs
    # in the caller), so no lock is needed around them.
    results_agg = defaultdict(int)
    done = 0
    skipped_checkpoint = 0
    skipped_no_skill = 0
    total_estimate = count_collection(qdrant, collection)
    start = time.time()
    offset = None
    while True:
        scroll_results, offset = qdrant.scroll(
            collection_name=collection,
            limit=SCROLL_BATCH,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        # Filter to points needing migration
        pending = []
        for p in scroll_results:
            if "skill_level" not in p.payload:
                skipped_no_skill += 1
                continue
            if checkpoint.is_done(p.id):
                skipped_checkpoint += 1
                continue
            pending.append(p)
        if limit:
            # Honour --limit inside the batch too; otherwise a small limit
            # still processes a whole scroll batch.
            pending = pending[:max(limit - done, 0)]
        if pending:
            with ThreadPoolExecutor(max_workers=workers) as ex:
                futures = [
                    ex.submit(process_point, p, qdrant, collection, rotator, checkpoint, False)
                    for p in pending
                ]
                for future in as_completed(futures):
                    try:
                        status = future.result()
                    except Exception as e:
                        status = f"error: {str(e)[:80]}"
                        log.error(f"Worker error: {e}")
                    results_agg[status] += 1
                    done += 1
                    if done % 5000 == 0:
                        elapsed = time.time() - start
                        rate = done / elapsed * 60
                        remaining = total_estimate - done - skipped_checkpoint - skipped_no_skill
                        eta = remaining / (done / elapsed) / 60 if done > 0 else 0
                        log.info(f" {done:,} done | {rate:.0f}/min | "
                                 f"ETA ~{eta:.0f}min | {dict(results_agg)}")
                        checkpoint.flush()
        time.sleep(0.02)
        if limit and done >= limit:
            break
        if offset is None:
            break
    checkpoint.flush()
    return done, skipped_checkpoint, skipped_no_skill, results_agg, start
# ── Main ────────────────────────────────────────────────────────────────────
def main():
    """CLI entry point: report collection size and cost, then dry-run or migrate.

    Flags: ``--dry-run`` classifies up to 20 samples with distinct domain
    pairs and prints them without writing; ``--workers`` sets the thread pool
    size; ``--limit`` caps the number of points processed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true",
                        help="Classify 20 samples without writing anything")
    parser.add_argument("--workers", type=int, default=16)
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()
    config = get_config()
    keys = load_gemini_keys()
    rotator = KeyRotator(keys)
    qdrant = QdrantClient(
        host=config['vector_db']['host'],
        port=config['vector_db']['port'],
        timeout=120
    )
    collection = config['vector_db']['collection']
    checkpoint = Checkpoint(CHECKPOINT_FILE)
    total_vectors = count_collection(qdrant, collection)
    pre_checkpoint = checkpoint.count()
    log.info(f"Collection has {total_vectors:,} vectors")
    log.info(f"Checkpoint: {pre_checkpoint:,} already completed")
    log.info(f"Workers: {args.workers} | Keys: {len(keys)} | Dry run: {args.dry_run}")
    log.info(f"Estimated Gemini Flash cost: ~${(total_vectors - pre_checkpoint) * 0.0004:.2f}")
    log.info(f"Streaming in batches of {SCROLL_BATCH:,} (memory-bounded)")
    if args.dry_run:
        # Scroll one batch, classify 20 diverse samples
        log.info(f"\nDRY RUN: classifying 20 samples...\n")
        scroll_results, _ = qdrant.scroll(
            collection_name=collection,
            limit=200,
            with_payload=True,
            with_vectors=False,
        )
        samples = []
        seen_domains = set()
        for p in scroll_results:
            if "skill_level" not in p.payload:
                continue
            domains = p.payload.get("domain", [])
            if isinstance(domains, str):
                domains = [domains]
            # Keep one sample per distinct pair of leading domains.
            d_key = tuple(sorted(domains[:2]))
            if d_key not in seen_domains:
                samples.append(p)
                seen_domains.add(d_key)
            if len(samples) >= 20:
                break
        for i, p in enumerate(samples, 1):
            pay = p.payload
            title = pay.get("title", "(no title)")
            raw_domains = pay.get("domain", [])
            # classify() slices and joins this value, so a bare string must be
            # wrapped first or it is sent as "M, e, d, i, c".
            domains = [raw_domains] if isinstance(raw_domains, str) else raw_domains
            old_skill = pay.get("skill_level", "?")
            subdomains = pay.get("subdomain", [])
            if isinstance(subdomains, str):
                subdomains = [subdomains]
            content = pay.get("content", pay.get("summary", ""))
            key = rotator.next()
            kt, cx = classify(title, domains, subdomains, content, key)
            print(f"\n--- Sample {i}/{len(samples)} ---")
            print(f" Title: {title}")
            print(f" Domain: {raw_domains}")
            print(f" Old skill: {old_skill}")
            print(f" → knowledge_type: {kt}")
            print(f" → complexity: {cx}")
        est = total_vectors - pre_checkpoint
        print(f"\nDRY RUN complete. ~{est:,} vectors would be migrated.")
        print(f"Estimated Gemini Flash cost: ~${est * 0.0004:.2f}")
        return
    # ── Full migration run (streaming) ──────────────────────────────────────
    done, skipped_ckpt, skipped_no_skill, results, start = stream_and_process(
        qdrant, collection, rotator, checkpoint, args.workers, args.limit
    )
    elapsed = time.time() - start
    log.info(f"\nComplete in {elapsed/60:.1f}min:")
    log.info(f" Processed: {done:,}")
    log.info(f" Skipped (checkpoint): {skipped_ckpt:,}")
    log.info(f" Skipped (no skill): {skipped_no_skill:,}")
    for status, count in sorted(results.items(), key=lambda x: -x[1]):
        log.info(f" {status:<30} {count:>10,}")
if __name__ == "__main__":
main()

227
scripts/rebuild_qdrant.py Executable file
View file

@ -0,0 +1,227 @@
"""
RECON Qdrant Rebuilder patched for headless parallel execution
Deletes and recreates the Qdrant collection, then re-embeds ALL concept JSONs
from disk using parallel workers. Pass --confirm to skip interactive prompt.
Usage:
python3 scripts/rebuild_qdrant.py --confirm [--workers 8]
"""
import json
import os
import sys
import time
import argparse
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import requests as http_requests
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from lib.utils import get_config, concept_id, setup_logging
from lib.status import StatusDB
logger = setup_logging('recon.rebuild')
def embed_content(config, content):
    """Return an embedding vector for ``content``.

    Posts to the TEI ``/embed`` endpoint first; if anything in that attempt
    raises, falls back to the Ollama ``/api/embed`` endpoint. Errors from the
    fallback propagate to the caller.
    """
    try:
        tei_cfg = config['embedding']
        tei_url = f"http://{tei_cfg['tei_host']}:{tei_cfg['tei_port']}/embed"
        tei_resp = http_requests.post(tei_url, json={"inputs": content}, timeout=120)
        tei_resp.raise_for_status()
        return tei_resp.json()[0]
    except Exception as tei_err:
        logger.debug(f"TEI failed, trying Ollama: {tei_err}")
        ollama_cfg = config['embedding']
        ollama_url = f"http://{ollama_cfg['ollama_host']}:{ollama_cfg['ollama_port']}/api/embed"
        request_body = {
            "model": ollama_cfg['model'],
            "input": content
        }
        ollama_resp = http_requests.post(ollama_url, json=request_body, timeout=120)
        ollama_resp.raise_for_status()
        return ollama_resp.json()['embeddings'][0]
def process_doc(doc_hash, config, db, qdrant, collection):
    """Embed and upsert all concepts for a single document.

    Args:
        doc_hash: Name of the document's directory under the concepts path.
        config: Loaded configuration mapping.
        db: Status database handle (``get_document`` / ``update_status``).
        qdrant: Client used for upserts.
        collection: Target collection name.

    Returns:
        ``(inserted, failed)`` — points actually upserted, and concepts whose
        embedding call raised.
    """
    doc_dir = os.path.join(config['paths']['concepts'], doc_hash)
    doc = db.get_document(doc_hash)
    filename = doc['filename'] if doc else doc_hash[:8]
    window_files = sorted(
        f for f in os.listdir(doc_dir)
        if f.startswith('window_') and f.endswith('.json')
    )
    all_concepts = []
    for wf in window_files:
        path = os.path.join(doc_dir, wf)
        try:
            with open(path, encoding='utf-8') as f:
                concepts = json.load(f)
            if isinstance(concepts, list):
                all_concepts.extend(concepts)
        except Exception as e:
            logger.warning(f"Skipping corrupted window {wf} in {doc_hash}: {e}")
    if not all_concepts:
        return 0, 0
    is_web = doc.get('path', '').startswith(('http://', 'https://')) if doc else False
    # Check meta.json for explicit source_type (e.g. 'transcript')
    source_type = 'web' if is_web else 'document'
    text_dir = os.path.join(config['paths']['text'], doc_hash)
    meta_path = os.path.join(text_dir, 'meta.json')
    if os.path.exists(meta_path):
        try:
            with open(meta_path, encoding='utf-8') as mf:
                meta = json.load(mf)
            if meta.get('source_type'):
                source_type = meta['source_type']
        except Exception as e:
            # Best-effort: keep the inferred source_type, but leave a trace.
            logger.debug(f"Ignoring unreadable meta.json for {doc_hash}: {e}")
    # Concept fields copied verbatim into the payload when present.
    # knowledge_type / complexity replace skill_level in migrated concept
    # files and must survive a rebuild; skill_level is kept for files that
    # have not been migrated.
    copied_fields = ['content', 'summary', 'title', 'domain', 'subdomain',
                     'keywords', 'skill_level', 'knowledge_type', 'complexity',
                     'key_facts', 'scenario_applicable',
                     'cross_domain_tags', 'chapter', 'page_ref', 'notes',
                     '_window', '_start_page']
    points = []
    inserted = 0
    failed = 0
    batch_size = config['processing']['embed_batch_size']
    for idx, concept in enumerate(all_concepts):
        # A malformed (non-dict) entry must not abort the whole document.
        if not isinstance(concept, dict):
            continue
        content = concept.get('content', '')
        if not content or len(content.strip()) < 10:
            continue
        try:
            vector = embed_content(config, content)
        except Exception as e:
            logger.warning(f"Embedding failed {doc_hash}:{idx}: {e}")
            failed += 1
            continue
        start_page = concept.get('_start_page', 0)
        # idx is the position across all windows, so IDs stay stable even
        # when some concepts are skipped.
        point_id = concept_id(doc_hash, start_page, idx)
        payload = {
            'doc_hash': doc_hash,
            'filename': filename,
            'book_title': doc.get('book_title', '') if doc else '',
            'book_author': doc.get('book_author', '') if doc else '',
            'source_type': source_type,
            'verification_status': 'unverified',
            'credibility_score': 0.7,
            'language': 'en',
        }
        for field in copied_fields:
            if field in concept:
                payload[field] = concept[field]
        points.append(PointStruct(id=point_id, vector=vector, payload=payload))
        if len(points) >= batch_size:
            qdrant.upsert(collection_name=collection, points=points)
            inserted += len(points)
            points = []
    if points:
        qdrant.upsert(collection_name=collection, points=points)
        inserted += len(points)
    # `inserted` counts upserted points only; concepts skipped for short or
    # missing content are not reported as inserted.
    if doc:
        db.update_status(doc_hash, 'complete', vectors_inserted=inserted)
    return inserted, failed
def run_rebuild(workers=8):
    """Drop and recreate the Qdrant collection, then re-insert every document's concepts.

    Each sub-directory of ``config['paths']['concepts']`` is treated as one
    document hash and handed to ``process_doc`` on a thread pool. Progress is
    logged every 500 documents and a summary is logged at the end.

    Args:
        workers: Number of threads used to process documents in parallel.
    """
    config = get_config()
    # NOTE(review): this instance is not used below (each task gets its own
    # StatusDB()); kept in case construction has side effects — confirm.
    db = StatusDB()
    qdrant = QdrantClient(
        host=config['vector_db']['host'],
        port=config['vector_db']['port'],
        timeout=60
    )
    collection = config['vector_db']['collection']

    # Delete and recreate
    try:
        qdrant.delete_collection(collection)
        logger.info(f"Deleted collection: {collection}")
    except Exception as e:
        # Best effort: the collection may simply not exist yet. Log instead of
        # hiding the reason; create_collection below surfaces real problems.
        logger.warning(f"Could not delete collection {collection} (continuing): {e}")
    qdrant.create_collection(
        collection_name=collection,
        vectors_config=VectorParams(
            size=config['embedding']['dimensions'],
            distance=Distance.COSINE
        )
    )
    logger.info(f"Created collection: {collection} ({config['embedding']['dimensions']}d, Cosine)")

    concepts_root = config['paths']['concepts']
    doc_dirs = sorted([
        d for d in os.listdir(concepts_root)
        if os.path.isdir(os.path.join(concepts_root, d))
    ])
    logger.info(f"Found {len(doc_dirs)} document concept directories | {workers} workers")

    total_inserted = 0
    total_failed = 0
    done = 0
    start = time.time()
    with ThreadPoolExecutor(max_workers=workers) as ex:
        futures = {
            ex.submit(process_doc, h, config, StatusDB(), qdrant, collection): h
            for h in doc_dirs
        }
        # Results are consumed only by this thread, so the counters need no lock.
        for future in as_completed(futures):
            doc_hash = futures[future]
            try:
                inserted, failed = future.result()
            except Exception as e:
                logger.error(f"Worker error {doc_hash}: {e}")
                inserted, failed = 0, 0
            total_inserted += inserted
            total_failed += failed
            done += 1
            if done % 500 == 0:
                elapsed = time.time() - start
                rate = total_inserted / elapsed if elapsed > 0 else 0
                remaining = (len(doc_dirs) - done) / (done / elapsed) if elapsed > 0 else 0
                logger.info(
                    f" [{done}/{len(doc_dirs)}] "
                    f"{total_inserted:,} vectors | "
                    f"{rate:.0f}/sec | "
                    f"ETA {remaining/60:.0f}min"
                )

    elapsed = time.time() - start
    logger.info(f"\nRebuild complete in {elapsed/60:.1f} min: "
                f"{total_inserted:,} inserted, {total_failed:,} failed")
if __name__ == '__main__':
    # Command-line entry point. The rebuild is destructive, so unless --confirm
    # is given the operator must type the word REBUILD to continue.
    cli = argparse.ArgumentParser()
    cli.add_argument('--confirm', action='store_true', help='Skip interactive prompt')
    cli.add_argument('--workers', type=int, default=8)
    opts = cli.parse_args()

    needs_prompt = not opts.confirm
    if needs_prompt:
        print("WARNING: This will DELETE and RECREATE the Qdrant collection.")
        typed = input("Type 'REBUILD' to proceed: ")
        if typed != 'REBUILD':
            print("Aborted.")
            sys.exit(0)

    run_rebuild(workers=opts.workers)

314
scripts/reenrich_reference.py Executable file
View file

@ -0,0 +1,314 @@
#!/usr/bin/env python3
"""
reenrich_reference.py Re-classifies all remaining Reference-tagged concepts.
Scrolls Qdrant for vectors with domain == ["Reference"] or containing "Reference",
calls Gemini with a hardened prompt that rejects Reference as a valid response,
updates both Qdrant payload and concept JSON on disk.
Usage:
python3 /opt/recon/scripts/reenrich_reference.py [--dry-run] [--workers 16] [--limit N]
"""
import json
import time
import random
import logging
import argparse
import threading
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import FieldCondition, MatchAny, Filter
import sys
sys.path.insert(0, '/opt/recon')
from lib.utils import get_config, setup_logging
# Filesystem locations used by this script.
LOG_FILE = Path("/opt/recon/logs/reenrich_reference.log")

# Root logging setup: INFO and above, sent both to LOG_FILE and to a default
# StreamHandler.
logging.basicConfig(
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()],
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
log = logging.getLogger("reenrich_reference")

# Root directory holding one sub-directory of window_*.json files per document hash.
CONCEPTS_DIR = Path("/opt/recon/data/concepts")
# The only domain labels accepted from the classifier; anything else
# (including "Reference") is filtered out by membership tests against this set.
CANONICAL_DOMAINS = {
    "Defense & Tactics",
    "Sustainment Systems",
    "Off-Grid Systems",
    "Foundational Skills",
    "Communications",
    "Medical",
    "Food Systems",
    "Navigation",
    "Logistics",
    "Power Systems",
    "Leadership",
    "Scenario Playbooks",
    "Water Systems",
    "Security",
    "Community Coordination",
}
# Hardened prompt — Reference explicitly forbidden, classification rules detailed.
# Filled with str.format(): {title}, {subdomain} and {content} are placeholders;
# the braces of the JSON example on the last line are doubled so format() emits
# them literally. Each rule line uses "->" to separate the topic list from the
# domain it maps to, so the target domain cannot be read as one more topic.
CLASSIFY_PROMPT = """\
You are a knowledge classification engine. Classify this concept into its correct domain.
VALID DOMAINS - use ONLY these exact strings:
Defense & Tactics
Sustainment Systems
Off-Grid Systems
Foundational Skills
Communications
Medical
Food Systems
Navigation
Logistics
Power Systems
Leadership
Scenario Playbooks
Water Systems
Security
Community Coordination
FORBIDDEN: Do NOT output "Reference" under any circumstances. It is not a valid domain.
FORBIDDEN: Do NOT output an empty domain list.
CLASSIFICATION RULES:
- First aid, anatomy, pharmacology, herbs, veterinary, austere medicine, wound care -> Medical
- Food growing, foraging, hunting, fishing, animal husbandry, livestock -> Sustainment Systems
- Food preservation, canning, fermentation, food storage, dehydrating -> Food Systems
- Solar, wind, hydro, batteries, generators, inverters, charge controllers -> Power Systems
- Water sourcing, filtration, purification, sanitation, wells, rainwater -> Water Systems
- Radio, antennas, mesh networking, SIGINT, amateur radio -> Communications
- Weapons, tactics, NBC, security operations, field craft -> Defense & Tactics
- Permaculture, soil science, agroforestry, composting -> Sustainment Systems
- Shelter, construction, masonry, blacksmithing, woodworking, crafts -> Foundational Skills
- Navigation, land nav, celestial nav, map reading, compass -> Navigation
- Emergency planning, disaster prep, scenario planning -> Scenario Playbooks
- Leadership, governance, community organization -> Leadership
- Supply chain, transportation, inventory -> Logistics
- Physical security, perimeter, surveillance -> Security
- Community building, cooperation, mutual aid -> Community Coordination
- Biogas, wood gasification, rocket stoves, appropriate technology -> Off-Grid Systems
If uncertain between two domains, pick the most actionable one for a self-reliant household.
Concept title: {title}
Concept subdomain tags: {subdomain}
Concept content: {content}
Return ONLY valid JSON, no markdown, no explanation:
{{"domain": ["Domain Name"]}}
"""
def load_gemini_keys(env_path="/opt/recon/.env"):
    """Return the Gemini API keys listed in a dotenv-style file.

    Every line whose name starts with ``GEMINI_KEY_`` contributes the text
    after the first ``=``. Surrounding whitespace and one pair of matching
    quotes are removed. Lines without ``=`` or with an empty value are skipped
    instead of raising or yielding an unusable empty key.

    Args:
        env_path: Path of the env file to read.

    Returns:
        List of key strings in file order (possibly empty).

    Raises:
        OSError: If the file cannot be read.
    """
    keys = []
    for line in Path(env_path).read_text().splitlines():
        line = line.strip()
        if not line.startswith("GEMINI_KEY_"):
            continue
        _name, sep, value = line.partition("=")
        value = value.strip()
        # Accept KEY="value" / KEY='value' as written by many dotenv tools.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        if sep and value:
            keys.append(value)
    return keys
class KeyRotator:
    """Hands out API keys round-robin; safe to share between worker threads."""

    def __init__(self, keys):
        """Store the key sequence; rotation starts at the first key."""
        self.keys = keys
        self._i = 0  # number of keys handed out so far
        self._lock = threading.Lock()

    def next(self):
        """Return the next key in rotation.

        Raises:
            ValueError: If no keys are configured (instead of the opaque
                ZeroDivisionError the modulo would otherwise produce).
        """
        with self._lock:
            if not self.keys:
                raise ValueError("KeyRotator has no API keys to rotate through")
            key = self.keys[self._i % len(self.keys)]
            self._i += 1
            return key
def classify(title, subdomains, content, key, attempt=0):
    """Ask Gemini for the concept's domain(s); never returns "Reference".

    The model's answer is filtered against CANONICAL_DOMAINS. An answer with
    no valid domain is retried exactly once; transient API errors (rate limit,
    quota, 503) are retried with exponential backoff, up to four calls in
    total. If no valid answer is obtained the subdomain keyword heuristic
    decides.

    Args:
        title: Concept title (may be empty).
        subdomains: List of subdomain tags (may be empty or None).
        content: Concept text; only the first 400 characters are sent.
        key: Gemini API key to use for this call.
        attempt: Unused; retained so existing callers keep working.

    Returns:
        Non-empty list of canonical domain names.
    """
    subdomains = subdomains or []
    prompt = CLASSIFY_PROMPT.format(
        title=title or "(untitled)",
        subdomain=", ".join(str(s) for s in subdomains[:10]) if subdomains else "(none)",
        content=str(content)[:400] if content else "(none)",
    )
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"}
    )
    max_calls = 4
    invalid_answers = 0
    for retry in range(max_calls):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
        except Exception as e:
            err = str(e).lower()
            if not any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
                break
            # Back off before the next call; pointless after the final one.
            if retry < max_calls - 1:
                time.sleep(min(5 * (2 ** retry) + random.uniform(0, 3), 60))
            continue

        raw = data.get("domain", []) if isinstance(data, dict) else []
        if isinstance(raw, str):
            # A bare string would otherwise be iterated character by character.
            raw = [raw]
        elif not isinstance(raw, (list, tuple)):
            raw = []
        domains = [
            d for d in raw
            if isinstance(d, str) and d in CANONICAL_DOMAINS  # strips Reference automatically
        ]
        if domains:
            return domains
        # Gemini returned Reference or empty — try once more, then give up.
        invalid_answers += 1
        if invalid_answers >= 2:
            break
    # Last resort: subdomain keyword heuristic
    return subdomain_fallback(subdomains)
# Ordered (keyword fragments, domain) pairs used when the model gives no usable
# answer. Entries are tried top to bottom and the first pair with any fragment
# found in the lower-cased subdomain text wins, so order is significant.
SUBDOMAIN_FALLBACK_MAP = [
    (
        ["first aid", "trauma", "wound", "anatomy", "pharmacol",
         "herbal", "medicin", "veterinar", "dental", "surgery"],
        "Medical",
    ),
    (
        ["foraging", "hunting", "fishing", "livestock", "permaculture",
         "soil", "agroforestry", "mycolog", "mushroom"],
        "Sustainment Systems",
    ),
    (
        ["canning", "preservation", "fermentation", "food storage", "dehydrat"],
        "Food Systems",
    ),
    (
        ["solar", "battery", "generator", "inverter", "wind turbine", "photovoltaic"],
        "Power Systems",
    ),
    (
        ["water purif", "filtration", "sanitation", "well", "rainwater"],
        "Water Systems",
    ),
    (
        ["radio", "antenna", "mesh", "sigint", "amateur radio", "meshtastic"],
        "Communications",
    ),
    (
        ["weapon", "firearm", "tactic", "nbc", "chemical warfare", "ballistic"],
        "Defense & Tactics",
    ),
    (
        ["navigation", "compass", "land nav", "celestial"],
        "Navigation",
    ),
    (
        ["blacksmith", "woodwork", "masonry", "construct", "craft", "pottery"],
        "Foundational Skills",
    ),
    (
        ["biogas", "gasif", "rocket stove", "appropriate tech"],
        "Off-Grid Systems",
    ),
    (
        ["disaster", "emergency prep", "evacuation", "scenario"],
        "Scenario Playbooks",
    ),
    (
        ["leadership", "governance", "community"],
        "Leadership",
    ),
    (
        ["logistics", "supply chain", "transport"],
        "Logistics",
    ),
    (
        ["security", "perimeter", "surveillance"],
        "Security",
    ),
]
def subdomain_fallback(subdomains):
    """Pick a domain from subdomain tags by keyword matching.

    The tags are lower-cased and joined into one string, then the entries of
    SUBDOMAIN_FALLBACK_MAP are tried in order; the first entry with any
    keyword fragment contained in that string wins.

    Args:
        subdomains: Iterable of tags, a single tag string, or None. Non-string
            entries are converted with str() and None entries are ignored.

    Returns:
        One-element list with the matched domain, or ["Foundational Skills"]
        when nothing matches.
    """
    if subdomains is None:
        subdomains = []
    elif isinstance(subdomains, str):
        # A bare string would otherwise be split into single characters.
        subdomains = [subdomains]
    combined = " ".join(str(s).lower() for s in subdomains if s is not None)
    for keywords, domain in SUBDOMAIN_FALLBACK_MAP:
        if any(kw in combined for kw in keywords):
            return [domain]
    return ["Foundational Skills"]  # absolute last resort
# Serializes the read-modify-write cycle below. process_point runs on a thread
# pool and several points can belong to the same window file; unsynchronized
# in-place rewrites of one file can interleave and corrupt it.
_CONCEPT_FILE_LOCK = threading.Lock()


def update_concept_json(doc_hash, title, new_domains):
    """Update the domain of the matching concept in the document's window files.

    Scans ``CONCEPTS_DIR/<doc_hash>/window_*.json`` for concepts whose title
    equals ``title`` and whose domain contains "Reference" or no canonical
    domain, and replaces that domain with ``new_domains``. Stops after the
    first file that was changed.

    The rewritten file is first written to a sibling ``.tmp`` file and then
    moved over the original, so an interrupted write cannot leave a truncated
    window file behind.

    Args:
        doc_hash: Document hash naming the concept directory.
        title: Concept title to match exactly.
        new_domains: List of domain names to store.

    Returns:
        True if a window file was rewritten, False otherwise.
    """
    doc_dir = CONCEPTS_DIR / doc_hash
    if not doc_dir.exists():
        return False
    for wf in doc_dir.glob("window_*.json"):
        try:
            with _CONCEPT_FILE_LOCK:
                with open(wf, "r", encoding="utf-8") as f:
                    concepts = json.load(f)
                if not isinstance(concepts, list):
                    continue
                changed = False
                for c in concepts:
                    if not isinstance(c, dict):
                        continue
                    if c.get("title") != title:
                        continue
                    raw = c.get("domain") or []  # tolerate "domain": null
                    if isinstance(raw, str):
                        raw = [raw]
                    has_canonical = any(
                        isinstance(d, str) and d in CANONICAL_DOMAINS for d in raw
                    )
                    if "Reference" in raw or not has_canonical:
                        c["domain"] = new_domains
                        changed = True
                if changed:
                    tmp = wf.with_name(wf.name + ".tmp")
                    try:
                        with open(tmp, "w", encoding="utf-8") as f:
                            json.dump(concepts, f, indent=2, ensure_ascii=False)
                        tmp.replace(wf)
                    except Exception:
                        # Do not leave a half-written temp file next to the data.
                        if tmp.exists():
                            tmp.unlink()
                        raise
                    return True
        except Exception as e:
            # Best effort per file: report and move on to the next window.
            log.warning(f"Could not update concept file {wf}: {e}")
    return False
def process_point(point, qdrant, collection, key_rotator, dry_run):
    """Re-classify one Qdrant point and persist the new domain.

    Args:
        point: Qdrant record; its ``payload`` and ``id`` are read.
        qdrant: Client used for ``set_payload``.
        collection: Name of the collection the point lives in.
        key_rotator: Object whose ``next()`` returns the API key to use.
        dry_run: When true, nothing is classified or written.

    Returns:
        "would_classify" for a dry run, otherwise "ok".
    """
    # Decide before spending an API call: a dry run must not contact Gemini.
    if dry_run:
        return "would_classify"

    payload = point.payload or {}
    title = payload.get("title", "")
    subdomains = payload.get("subdomain") or []
    if isinstance(subdomains, str):
        subdomains = [subdomains]
    # Fall back to the summary when content is missing or empty.
    content = payload.get("content") or payload.get("summary", "")
    doc_hash = payload.get("doc_hash", "")

    key = key_rotator.next()
    new_domains = classify(title, subdomains, content, key)

    # Update Qdrant payload
    qdrant.set_payload(
        collection_name=collection,
        payload={"domain": new_domains},
        points=[point.id],
    )
    # Update JSON on disk
    if doc_hash:
        update_concept_json(doc_hash, title, new_domains)
    return "ok"
def main():
    """Re-classify every Qdrant point whose ``domain`` payload contains "Reference".

    Flags: ``--dry-run`` only counts the affected points, ``--workers`` sets
    the thread-pool size, ``--limit`` caps how many points are processed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--workers", type=int, default=16)
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()

    config = get_config()
    keys = load_gemini_keys()
    rotator = KeyRotator(keys)
    qdrant = QdrantClient(
        host=config['vector_db']['host'],
        port=config['vector_db']['port'],
        timeout=60
    )
    collection = config['vector_db']['collection']

    log.info("Scrolling Qdrant for Reference-tagged concepts...")
    # Scroll all points containing Reference in domain
    offset = None
    reference_points = []
    while True:
        page, offset = qdrant.scroll(
            collection_name=collection,
            scroll_filter=Filter(
                must=[FieldCondition(
                    key="domain",
                    match=MatchAny(any=["Reference"])
                )]
            ),
            limit=1000,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        reference_points.extend(page)
        if offset is None:
            break
        if args.limit and len(reference_points) >= args.limit:
            break
    # Truncate after the loop so --limit also applies when the final page
    # (offset is None) pushed the list past the limit.
    if args.limit:
        reference_points = reference_points[:args.limit]

    total = len(reference_points)
    log.info(f"Found {total:,} Reference-tagged vectors")
    log.info(f"Workers: {args.workers} | Keys: {len(keys)} | Dry run: {args.dry_run}")
    log.info(f"Estimated Gemini Flash cost: ~${total * 0.0004:.2f}")
    if args.dry_run:
        log.info(f"DRY RUN: would re-classify {total:,} concepts. Exiting.")
        return
    if total and not keys:
        # Without keys every worker would fail; stop with a clear message.
        log.error("No GEMINI_KEY_* entries found in /opt/recon/.env; cannot classify.")
        return

    results = defaultdict(int)
    done = 0
    start = time.time()
    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futures = {
            ex.submit(process_point, p, qdrant, collection, rotator, False): p
            for p in reference_points
        }
        # Results are consumed only by this thread, so no lock is needed.
        for future in as_completed(futures):
            try:
                status = future.result()
            except Exception as e:
                # One bad point must not abort the remaining work.
                log.error(f"Worker error on point {futures[future].id}: {e}")
                status = "error"
            results[status] += 1
            done += 1
            if done % 5000 == 0:
                elapsed = time.time() - start
                rate = done / elapsed * 60
                eta = (total - done) / (done / elapsed) / 60
                log.info(f" {done:,}/{total:,} | {rate:.0f}/min | ETA {eta:.0f}min | {dict(results)}")
            # NOTE(review): this pause only slows result collection; it does
            # not throttle the workers themselves — confirm it is still wanted.
            time.sleep(0.02)

    elapsed = time.time() - start
    log.info(f"\nComplete in {elapsed/60:.1f}min:")
    for status, count in sorted(results.items(), key=lambda x: -x[1]):
        log.info(f" {status:<20} {count:>10,}")


if __name__ == "__main__":
    main()

315
scripts/repair_corrupted.py Executable file
View file

@ -0,0 +1,315 @@
#!/usr/bin/env python3
"""
repair_corrupted.py Repairs window files corrupted by concurrent writes.
Strategy:
1. Read corrupted_windows.txt to get the list of bad files
2. For each bad file, identify the parent doc hash from the path
3. Check if the text directory still exists for that doc
4. If yes: re-run Gemini enrichment on just that window
5. If no text: mark as unrecoverable
6. Report summary
Usage:
python3 /opt/recon/scripts/repair_corrupted.py [--dry-run] [--workers 8]
"""
import json
import time
import random
import logging
import argparse
import re
import threading
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
import google.generativeai as genai
# Input: list of corrupted window files (first tab-separated field of each line).
CORRUPTED_LIST = Path("/opt/recon/data/corrupted_windows.txt")
# Per-document directories of extracted page_*.txt files.
TEXT_DIR = Path("/opt/recon/data/text")
# Per-document directories of window_*.json concept files.
CONCEPTS_DIR = Path("/opt/recon/data/concepts")
LOG_FILE = Path("/opt/recon/logs/repair_corrupted.log")
# Output: files that could not be repaired, one "path<TAB>reason" per line.
UNRECOVERABLE_LOG = Path("/opt/recon/data/unrecoverable_windows.txt")

# Root logging setup: INFO and above, sent both to LOG_FILE and to a default
# StreamHandler.
logging.basicConfig(
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(),
    ],
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
log = logging.getLogger("repair_corrupted")
# Domain labels the enrichment prompt allows, in the order the prompt lists them.
CANONICAL_DOMAINS = [
    "Defense & Tactics",
    "Sustainment Systems",
    "Off-Grid Systems",
    "Foundational Skills",
    "Communications",
    "Medical",
    "Food Systems",
    "Navigation",
    "Logistics",
    "Power Systems",
    "Leadership",
    "Scenario Playbooks",
    "Water Systems",
    "Security",
    "Community Coordination",
]
# Prompt prefix for re-enriching a window. enrich_window_text() sends
# ENRICH_PROMPT + <window text>, so the string must end with the
# "Document text:" line. It is used by plain concatenation (no str.format),
# so braces need no escaping here.
ENRICH_PROMPT = """Extract knowledge concepts from this document text.
A concept is a SELF-CONTAINED piece of knowledge that can stand alone.
For each concept, provide ALL fields:
Required:
- content: Full text of the concept (complete procedure, definition, etc.)
- summary: 1-2 sentence summary
- title: Brief descriptive title
- domain: Array of 1-5 from ONLY these exact strings (no others):
Defense & Tactics, Sustainment Systems, Off-Grid Systems, Foundational Skills,
Communications, Medical, Food Systems, Navigation, Logistics, Power Systems,
Leadership, Scenario Playbooks, Water Systems, Security, Community Coordination
CRITICAL: Do NOT use "Reference". Every concept belongs somewhere specific.
- subdomain: Array of specific subcategories (up to 10)
- keywords: Array of 3-30 searchable terms
- skill_level: novice | intermediate | advanced
- key_facts: Array of specific extractable claims, measurements, data points
Optional (include when present):
- scenario_applicable: Array from: tuesday_prepper, month_prepper, year_prepper, multi_year, eotwawki
- cross_domain_tags: Array from: sustainment, medical, security, communications, leadership, logistics, navigation, power_systems, water_systems, food_systems, tactical_ops, community_coordination
- chapter: Chapter name if identifiable
- page_ref: Page reference
Return JSON array. If no extractable concepts, return [].
Document text:
"""
def load_gemini_keys(env_path="/opt/recon/.env"):
    """Return the Gemini API keys listed in a dotenv-style file.

    Every line whose name starts with ``GEMINI_KEY_`` contributes the text
    after the first ``=``. Surrounding whitespace and one pair of matching
    quotes are removed. Lines without ``=`` or with an empty value are skipped
    instead of raising or yielding an unusable empty key.

    Args:
        env_path: Path of the env file to read.

    Returns:
        List of key strings in file order (possibly empty).

    Raises:
        OSError: If the file cannot be read.
    """
    env = Path(env_path)
    keys = []
    for line in env.read_text().splitlines():
        line = line.strip()
        if not line.startswith("GEMINI_KEY_"):
            continue
        _name, sep, value = line.partition("=")
        value = value.strip()
        # Accept KEY="value" / KEY='value' as written by many dotenv tools.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        if sep and value:
            keys.append(value)
    return keys
class KeyRotator:
    """Hands out API keys round-robin; safe to share between worker threads."""

    def __init__(self, keys):
        """Store the key sequence; rotation starts at the first key."""
        self.keys = keys
        self._i = 0  # number of keys handed out so far
        self._lock = threading.Lock()

    def next(self):
        """Return the next key in rotation.

        Raises:
            ValueError: If no keys are configured (instead of the opaque
                ZeroDivisionError the modulo would otherwise produce).
        """
        with self._lock:
            if not self.keys:
                raise ValueError("KeyRotator has no API keys to rotate through")
            key = self.keys[self._i % len(self.keys)]
            self._i += 1
            return key
def repair_json_truncated(text):
    """Last-ditch attempt to salvage a truncated JSON array.

    Steps:
      1. Strip control characters and trailing commas, then try a normal parse.
      2. Otherwise scan the text, tracking strings and escapes, to find the
         end of the last object that closes back to object depth 0.
      3. Cut the text there, close the arrays that were open at that point
         and parse again.

    Brackets and braces inside string values are ignored when counting, so a
    concept whose text contains "[" or "]" no longer breaks the salvage.

    Args:
        text: Raw (possibly truncated) JSON text.

    Returns:
        The parsed value, or None if the input is not a string or nothing
        could be salvaged.
    """
    if not isinstance(text, str):
        return None
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
    text = re.sub(r',\s*([}\]])', r'\1', text)
    try:
        return json.loads(text)
    except Exception:
        pass

    # Find last complete object, remembering how many arrays enclose it.
    last_close = -1
    arrays_open_at_close = 0
    obj_depth = 0
    arr_depth = 0
    in_str = False
    esc = False
    for i, ch in enumerate(text):
        if esc:
            # Character following a backslash inside a string: skip it.
            esc = False
            continue
        if in_str:
            if ch == '\\':
                esc = True
            elif ch == '"':
                in_str = False
            continue
        if ch == '"':
            in_str = True
        elif ch == '{':
            obj_depth += 1
        elif ch == '}':
            obj_depth -= 1
            if obj_depth == 0:
                last_close = i
                arrays_open_at_close = arr_depth
        elif ch == '[':
            arr_depth += 1
        elif ch == ']':
            arr_depth -= 1

    if last_close > 0:
        trimmed = text[:last_close + 1]
        try:
            return json.loads(trimmed + ']' * max(arrays_open_at_close, 0))
        except Exception:
            pass
    return None
def enrich_window_text(text, key):
    """Call Gemini on raw window text and return the extracted concepts.

    Up to four calls are made. Transient API errors (rate limit, quota, 503)
    are retried with exponential backoff; a response that cannot be parsed,
    even after ``repair_json_truncated``, is retried as well instead of being
    reported as "no concepts".

    Args:
        text: Window text appended to ENRICH_PROMPT.
        key: Gemini API key to use.

    Returns:
        List of concept dicts (empty when the model found none), or None when
        no usable response was obtained.
    """
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"}
    )
    max_calls = 4
    for attempt in range(max_calls):
        try:
            resp = model.generate_content(ENRICH_PROMPT + text)
            raw = resp.text
            try:
                result = json.loads(raw)
            except Exception:
                result = repair_json_truncated(raw)
                if result is None:
                    # Unparseable output is a failure, not an empty result:
                    # returning [] here would overwrite the window with nothing.
                    log.warning(" Unparseable model response; retrying")
                    continue
            if isinstance(result, list):
                return [c for c in result if isinstance(c, dict)]
            elif isinstance(result, dict):
                return [result]
            return []
        except Exception as e:
            err = str(e).lower()
            if any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
                # Back off before the next call; pointless after the final one.
                if attempt < max_calls - 1:
                    delay = min(5 * (2 ** attempt) + random.uniform(0, 3), 60)
                    time.sleep(delay)
            else:
                log.warning(f" Non-transient error: {e}")
                break
    return None  # failed
def get_window_text(doc_hash, window_filename, window_size=5, text_dir=None):
    """Reconstruct the text of one window from the document's page files.

    The 1-based window number is taken from the file name
    (``window_NNNN.json``). Page files ``page_*.txt`` in the document's text
    directory are sorted by name and window N covers the N-th run of
    ``window_size`` files.

    Args:
        doc_hash: Document hash naming the text sub-directory.
        window_filename: Name of the window file, e.g. ``window_0003.json``.
        window_size: Pages per window. NOTE(review): the default of 5 is
            described as coming from the pipeline config — confirm it matches.
        text_dir: Root of the per-document text directories; defaults to
            TEXT_DIR.

    Returns:
        The page texts joined with blank lines, each preceded by a
        ``--- Page N ---`` header, or None if the window cannot be rebuilt.
    """
    # Window filename: window_NNNN.json -> window index is NNNN
    try:
        w_idx = int(Path(window_filename).stem.split('_')[1]) - 1
    except (IndexError, ValueError):
        return None
    if w_idx < 0 or window_size <= 0:
        return None

    root = TEXT_DIR if text_dir is None else Path(text_dir)
    text_path = root / doc_hash
    if not text_path.is_dir():
        return None

    # NOTE(review): plain name ordering is kept on purpose so the window
    # boundaries stay whatever the original enrichment used — confirm that the
    # page files are zero-padded.
    page_files = sorted([
        f for f in text_path.iterdir()
        if f.name.startswith('page_') and f.name.endswith('.txt')
    ])
    if not page_files:
        return None

    start = w_idx * window_size
    window_pages = page_files[start:start + window_size]
    if not window_pages:
        return None

    parts = []
    for j, pf in enumerate(window_pages):
        try:
            text = pf.read_text(encoding='utf-8')
        except Exception as e:
            # Best effort: skip an unreadable page but say so.
            log.warning(f" Could not read {pf}: {e}")
            continue
        parts.append(f"--- Page {start + j + 1} ---\n{text}")
    return "\n\n".join(parts) if parts else None
def repair_file(corrupted_path, key_rotator, dry_run):
    """Attempt to repair a single corrupted window file.

    Args:
        corrupted_path: Path of the window file, expected to look like
            ``.../concepts/{hash}/window_NNNN.json``.
        key_rotator: Object whose ``next()`` returns the API key to use.
        dry_run: When true, report what would happen without calling the
            model or writing anything.

    Returns:
        One of "already_valid", "no_source_text", "would_repair",
        "enrichment_failed", "repaired" or "write_error".
    """
    path = Path(corrupted_path)

    # Sanity check -- maybe it fixed itself somehow
    try:
        with open(path, encoding='utf-8') as f:
            json.load(f)
        return "already_valid"
    except Exception:
        pass

    # Extract doc hash and window name from path structure
    # Expected: /opt/recon/data/concepts/{hash}/window_NNNN.json
    doc_hash = path.parent.name
    window_filename = path.name

    # Get source text for this window
    window_text = get_window_text(doc_hash, window_filename)
    if not window_text:
        return "no_source_text"
    if dry_run:
        return "would_repair"

    # Re-enrich from source text
    key = key_rotator.next()
    concepts = enrich_window_text(window_text, key)
    if concepts is None:
        return "enrichment_failed"

    # Tag concepts with metadata
    try:
        w_idx = int(Path(window_filename).stem.split('_')[1]) - 1
        window_size = 5
        start_page = w_idx * window_size + 1
    except Exception:
        w_idx = 0
        start_page = 0
    for c in concepts:
        c['_window'] = w_idx + 1
        c['_start_page'] = start_page
        c['_doc_hash'] = doc_hash
        c['_repaired'] = True

    # Write repaired file: dump to a sibling temp file and move it into place,
    # so an interrupted write cannot leave another truncated window behind.
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(concepts, f, indent=2, ensure_ascii=False)
        tmp_path.replace(path)
        return "repaired"
    except Exception as e:
        log.warning(f" Write failed for {path}: {e}")
        try:
            tmp_path.unlink()
        except OSError:
            pass
        return "write_error"
def main():
    """Repair every window file listed in CORRUPTED_LIST and report the outcome.

    Flags: ``--dry-run`` reports what would be repaired without calling the
    model or writing files; ``--workers`` sets the thread-pool size. Files
    that could not be repaired are written to UNRECOVERABLE_LOG.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--workers", type=int, default=8)
    args = parser.parse_args()

    if not CORRUPTED_LIST.exists():
        log.error(f"Corrupted list not found: {CORRUPTED_LIST}")
        log.error("Run Task 1 first to generate it.")
        return

    keys = load_gemini_keys()
    rotator = KeyRotator(keys)

    corrupted = []
    with open(CORRUPTED_LIST) as f:
        for line in f:
            # First tab-separated field is the path. Blank lines are skipped:
            # "".split('\t') is [''], which would otherwise queue an empty path.
            entry = line.strip().split('\t')[0]
            if entry:
                corrupted.append(entry)

    log.info(f"Repairing {len(corrupted):,} corrupted window files")
    log.info(f"Dry run: {args.dry_run} | Workers: {args.workers} | Keys: {len(keys)}")
    if corrupted and not args.dry_run and not keys:
        # Without keys every repair would fail; stop with a clear message.
        log.error("No GEMINI_KEY_* entries found in /opt/recon/.env; cannot repair.")
        return

    results = defaultdict(int)
    unrecoverable = []
    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futures = {ex.submit(repair_file, p, rotator, args.dry_run): p for p in corrupted}
        done = 0
        # Results are consumed only by this thread, so no lock is needed.
        for future in as_completed(futures):
            path = futures[future]
            try:
                status = future.result()
            except Exception as e:
                # One bad file must not abort the remaining repairs.
                log.error(f"Worker error {path}: {e}")
                status = "worker_error"
            results[status] += 1
            if status in ("no_source_text", "enrichment_failed", "write_error", "worker_error"):
                unrecoverable.append((path, status))
            done += 1
            if done % 100 == 0:
                log.info(f" {done:,}/{len(corrupted):,} | {dict(results)}")
            # NOTE(review): this pause only slows result collection; it does
            # not throttle the workers themselves — confirm it is still wanted.
            time.sleep(0.05)

    log.info("── Results ─────────────────────────────────────────────────")
    for status, count in sorted(results.items(), key=lambda x: -x[1]):
        log.info(f" {status:<25} {count:>8,}")
    if unrecoverable:
        with open(UNRECOVERABLE_LOG, 'w') as f:
            for path, reason in unrecoverable:
                f.write(f"{path}\t{reason}\n")
        log.info(f"\n Unrecoverable: {len(unrecoverable)} — logged to {UNRECOVERABLE_LOG}")
    elif args.dry_run:
        # Nothing was written, so do not claim a successful repair.
        log.info("\n Dry run complete; no files were modified.")
    else:
        log.info("\n All files repaired successfully.")


if __name__ == "__main__":
    main()

178
scripts/validate.py Executable file
View file

@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
RECON Pipeline Validator
Checks pipeline consistency: paths, DB state, file integrity, and service connectivity.
Validates TEI, Ollama, and Qdrant are reachable. Deep mode checks every document on disk.
Usage: python3 scripts/validate.py [--deep]
"""
import json
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from lib.utils import get_config, setup_logging
from lib.status import StatusDB
logger = setup_logging('recon.validate')
def run_validation(deep=False):
    """Check pipeline consistency and print a report.

    Covers configured paths, the library root, Gemini keys, DB status counts,
    per-document text/concept directories, orphaned directories, and
    reachability of TEI, Ollama and Qdrant.

    NOTE(review): the per-document loop is run in both modes and ``deep`` only
    adds the file-level checks, which is what the ``elif deep:`` branches
    imply — confirm against the intended behaviour.

    Args:
        deep: Also verify that page files exist and that every window file is
            a readable JSON array.

    Returns:
        True when no issues were found (warnings do not count), else False.
    """
    config = get_config()
    db = StatusDB()
    issues = []
    warnings = []
    print("=== RECON Validation ===\n")

    # Check paths
    for name, path in config['paths'].items():
        if name == 'db':
            if not os.path.exists(path):
                issues.append(f"Database not found: {path}")
        else:
            if not os.path.exists(path):
                warnings.append(f"Directory missing: {name} = {path}")

    # Check library
    if not os.path.exists(config['library_root']):
        issues.append(f"Library root not found: {config['library_root']}")

    # Check Gemini keys
    keys = config.get('gemini_keys', [])
    if not keys:
        warnings.append("No Gemini API keys configured in .env")
    else:
        print(f" Gemini keys: {len(keys)} configured")

    # DB status counts
    counts = db.get_status_counts()
    cat = counts.get('catalogue', {})
    doc = counts.get('documents', {})
    print(f" Catalogue: {sum(cat.values())} entries")
    print(f" Documents: {sum(doc.values())} entries")
    print(f" Complete: {doc.get('complete', 0)}")
    print(f" Failed: {doc.get('failed', 0)}")

    if deep:
        print("\n--- Deep Validation ---\n")

    # Check every document in pipeline has corresponding files
    all_docs = db.get_all_documents()
    text_dir = config['paths']['text']
    concepts_dir = config['paths']['concepts']
    for d in all_docs:
        h = d['hash']
        status = d['status']
        if status in ('extracted', 'enriched', 'complete'):
            doc_text_dir = os.path.join(text_dir, h)
            if not os.path.exists(doc_text_dir):
                issues.append(f"[{h[:8]}] {d['filename']}: text dir missing but status={status}")
            elif deep:
                pages = [f for f in os.listdir(doc_text_dir) if f.startswith('page_')]
                if not pages:
                    issues.append(f"[{h[:8]}] {d['filename']}: no page files in text dir")
        if status in ('enriched', 'complete'):
            doc_concepts_dir = os.path.join(concepts_dir, h)
            if not os.path.exists(doc_concepts_dir):
                issues.append(f"[{h[:8]}] {d['filename']}: concepts dir missing but status={status}")
            elif deep:
                windows = [f for f in os.listdir(doc_concepts_dir) if f.startswith('window_')]
                if not windows:
                    issues.append(f"[{h[:8]}] {d['filename']}: no window files in concepts dir")
                else:
                    for wf in windows:
                        try:
                            with open(os.path.join(doc_concepts_dir, wf), encoding='utf-8') as f:
                                data = json.load(f)
                            if not isinstance(data, list):
                                issues.append(f"[{h[:8]}] {wf}: not a JSON array")
                        except json.JSONDecodeError:
                            issues.append(f"[{h[:8]}] {wf}: invalid JSON")
                        except (OSError, UnicodeDecodeError) as e:
                            # An unreadable file is a finding, not a reason to abort.
                            issues.append(f"[{h[:8]}] {wf}: unreadable ({e})")

    # Check orphaned directories. The hash set is built up front so the
    # concepts check also works when the text directory does not exist.
    doc_hashes = {d['hash'] for d in all_docs}
    if os.path.exists(text_dir):
        for dirname in os.listdir(text_dir):
            if dirname not in doc_hashes:
                warnings.append(f"Orphaned text dir: {dirname}")
    if os.path.exists(concepts_dir):
        for dirname in os.listdir(concepts_dir):
            if dirname not in doc_hashes:
                warnings.append(f"Orphaned concepts dir: {dirname}")
    print(f" Checked {len(all_docs)} documents")

    # Connectivity checks
    print("\n--- Connectivity ---\n")
    import requests as http_requests

    # Check TEI (primary embedding backend)
    try:
        tei_url = f"http://{config['embedding']['tei_host']}:{config['embedding']['tei_port']}/info"
        resp = http_requests.get(tei_url, timeout=10)
        if resp.status_code == 200:
            print(f" TEI: OK (bge-m3 at {config['embedding']['tei_host']}:{config['embedding']['tei_port']})")
        else:
            issues.append(f"TEI: HTTP {resp.status_code}")
    except Exception as e:
        issues.append(f"TEI: unreachable ({e})")

    # Check Ollama (fallback) — problems here are warnings only
    try:
        ollama_url = f"http://{config['embedding']['ollama_host']}:{config['embedding']['ollama_port']}/api/tags"
        resp = http_requests.get(ollama_url, timeout=10)
        if resp.status_code == 200:
            print(f" Ollama: OK (fallback at {config['embedding']['ollama_host']}:{config['embedding']['ollama_port']})")
        else:
            warnings.append(f"Ollama: HTTP {resp.status_code}")
    except Exception as e:
        warnings.append(f"Ollama: unreachable ({e}) — fallback only, not critical")

    # Check Qdrant and the configured collection
    try:
        from qdrant_client import QdrantClient
        qdrant = QdrantClient(
            host=config['vector_db']['host'],
            port=config['vector_db']['port'],
            timeout=10
        )
        collections = [c.name for c in qdrant.get_collections().collections]
        target = config['vector_db']['collection']
        if target in collections:
            info = qdrant.get_collection(target)
            print(f" Qdrant: OK ({target}: {info.points_count} points)")
        else:
            issues.append(f"Qdrant: collection {target} not found")
    except Exception as e:
        issues.append(f"Qdrant: unreachable ({e})")

    # Summary
    print("\n--- Summary ---\n")
    if warnings:
        print(f"Warnings ({len(warnings)}):")
        for w in warnings:
            print(f"{w}")
    if issues:
        print(f"\nIssues ({len(issues)}):")
        for i in issues:
            print(f"{i}")
        print(f"\nValidation FAILED: {len(issues)} issue(s)")
    else:
        print("Validation PASSED")
    return not issues


if __name__ == '__main__':
    deep = '--deep' in sys.argv
    run_validation(deep=deep)