mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Initial commit: RECON codebase baseline
Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
563c16bb71
59 changed files with 18327 additions and 0 deletions
0
scripts/__init__.py
Normal file
0
scripts/__init__.py
Normal file
373
scripts/aa_download.py
Executable file
373
scripts/aa_download.py
Executable file
|
|
@ -0,0 +1,373 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
aa_download.py — Anna's Archive bulk downloader for RECON library acquisition.
|
||||
|
||||
For each target book:
|
||||
1. Searches Anna's Archive (the mirror domain configured in BASE_AA) for the title + author
|
||||
2. Extracts the best PDF match (verified by author/page count)
|
||||
3. Gets the MD5 from the book page
|
||||
4. Attempts download from Libgen mirrors in order
|
||||
5. Verifies downloaded file is a valid PDF
|
||||
6. Writes full acquisition report
|
||||
|
||||
Usage:
|
||||
python3 /opt/recon/scripts/aa_download.py [--dry-run] [--limit N]
|
||||
|
||||
Report output: ~/projects/recon/aa_acquisition_report.md
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import random
|
||||
import hashlib
|
||||
import logging
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Markdown report written at the end of a run.
REPORT_PATH = Path.home() / "projects/recon/aa_acquisition_report.md"
LOG_FILE = Path("/opt/recon/logs/aa_download.log")

# logging.FileHandler opens its file as soon as it is constructed and raises
# FileNotFoundError when the parent directory is missing, which would abort
# the script at import time. Create the directory first.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()],
)
log = logging.getLogger("aa_download")

# One shared HTTP session; these browser-like headers go out with every request.
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Accept-Language": "en-US,en;q=0.9",
})

# Anna's Archive mirror used for search and detail pages.
BASE_AA = "https://annas-archive.gl"

# Download attempt order — try fastest mirrors first.
# Each template is filled with the book's MD5 via str.format(md5=...).
LIBGEN_MIRRORS = [
    "https://libgen.is/get.php?md5={md5}",
    "https://libgen.rs/get.php?md5={md5}",
    "https://libgen.st/get.php?md5={md5}",
    "https://libgen.li/ads.php?md5={md5}",
]
|
||||
|
||||
# ── Target book list ──────────────────────────────────────────────────────────
# Books grouped by destination sub-directory (relative to BASE_LIB). Each entry
# is (title, author); the grouping is flattened into TARGETS below.
_TARGETS_BY_SHELF = {
    # Medical — Herbalism
    "Medical/Herbalism": [
        ("Medical Herbalism", "David Hoffmann"),
        ("Making Plant Medicine", "Richo Cech"),
        ("The Earthwise Herbal Volume 1", "Matthew Wood"),
        ("The Earthwise Herbal Volume 2", "Matthew Wood"),
        ("Herbal Antibiotics", "Stephen Buhner"),
        ("Herbal Antivirals", "Stephen Buhner"),
        ("The Herbal Medicine-Maker's Handbook", "James Green"),
        ("Rosemary Gladstar's Medicinal Herbs", "Rosemary Gladstar"),
    ],
    # Medical — Austere
    "Medical/Austere": [
        ("Wilderness Medicine", "Paul Auerbach"),
        ("Medicine for Mountaineering", "James Wilkerson"),
    ],
    # Medical — Veterinary
    "Medical/Veterinary": [
        ("The Chicken Health Handbook", "Gail Damerow"),
        ("Goat Husbandry", "David Mackenzie"),
    ],
    # Power Systems
    "Power": [
        ("The Renewable Energy Handbook", "William Kemp"),
        ("Homebrew Wind Power", "Dan Bartmann"),
        ("Wind Energy Basics", "Paul Gipe"),
        ("12-Volt Bible", "Brotherton"),
        ("Wiring a House", "Rex Cauldwell"),
    ],
    # Navigation
    "Navigation": [
        ("Wilderness Navigation", "Bob Burns"),
        ("Be Expert with Map and Compass", "Bjorn Kjellstrom"),
        ("Emergency Navigation", "David Burch"),
        ("The Natural Navigator", "Tristan Gooley"),
        ("The Essential Wilderness Navigator", "David Seidman"),
    ],
    # Water Systems
    "Water": [
        ("Rainwater Harvesting for Drylands Volume 1", "Brad Lancaster"),
        ("Rainwater Harvesting for Drylands Volume 2", "Brad Lancaster"),
        ("Rainwater Harvesting for Drylands Volume 3", "Brad Lancaster"),
        ("Water Storage", "Art Ludwig"),
        ("The Home Water Supply", "Stu Campbell"),
    ],
    # Food Systems
    "Food": [
        ("The Art of Fermentation", "Sandor Katz"),
        ("Fermented Vegetables", "Kirsten Shockey"),
        ("Mastering Artisan Cheesemaking", "Gianaclis Caldwell"),
        ("Home Cheese Making", "Ricki Carroll"),
        ("The Art of Natural Cheesemaking", "David Asher"),
    ],
    # Permaculture
    "Permaculture": [
        ("Edible Forest Gardens Volume 1", "Dave Jacke"),
        ("Edible Forest Gardens Volume 2", "Dave Jacke"),
        ("Creating a Forest Garden", "Martin Crawford"),
        ("Sepp Holzer's Permaculture", "Sepp Holzer"),
        ("The Permaculture Handbook", "Peter Bane"),
        ("The Market Gardener", "Jean-Martin Fortier"),
    ],
    # Scenario / Emergency
    "Scenario": [
        ("SAS Survival Handbook", "John Wiseman"),
        ("Pocket Ref", "Thomas Glover"),
        ("Deep Survival", "Laurence Gonzales"),
    ],
    # Foundational Skills
    "Skills": [
        ("Back to Basics", "Reader's Digest"),
        ("A Pattern Language", "Christopher Alexander"),
    ],
}

# Flat list of (title, author, dest_dir) tuples, in shelf order, as consumed by main().
TARGETS = [
    (title, author, shelf)
    for shelf, books in _TARGETS_BY_SHELF.items()
    for title, author in books
]

# Root directory under which each dest_dir is created.
BASE_LIB = Path("/mnt/library/Acquired")
|
||||
|
||||
|
||||
def search_aa(title, author):
    """Query the Anna's Archive search page and collect candidate hits.

    Sends "<title> <author>" to ``{BASE_AA}/search`` restricted to English
    PDFs, then scans the result HTML for links whose href starts with
    ``/md5/``.

    Returns:
        A list of at most five dicts ``{"md5", "text", "href"}``, one per
        distinct 32-character MD5, in page order. Links with no visible text
        are ignored. An empty list is returned when the request fails.
    """
    try:
        response = SESSION.get(
            f"{BASE_AA}/search",
            params={"q": f"{title} {author}", "ext": "pdf", "lang": "en"},
            timeout=20,
        )
        response.raise_for_status()
    except Exception as e:
        log.warning(f"Search failed for '{title}': {e}")
        return []

    page = BeautifulSoup(response.text, "html.parser")

    # Keyed by MD5 so repeated links to the same book collapse into one entry;
    # dict insertion order keeps the hits in page order.
    hits = {}
    for anchor in page.select("a[href^='/md5/']"):
        href = anchor.get("href", "")
        # The MD5 is the path segment after /md5/, minus any sub-path or query.
        tail = href.split("/md5/")[-1]
        md5 = tail.split("/")[0].split("?")[0].strip()
        label = anchor.get_text(" ", strip=True)
        if len(md5) == 32 and label and md5 not in hits:
            hits[md5] = {"md5": md5, "text": label, "href": href}
            if len(hits) >= 5:
                break

    return list(hits.values())
|
||||
|
||||
|
||||
def get_book_details(md5):
    """Download the ``/md5/<md5>`` detail page and pull out basic metadata.

    Returns:
        ``{"pages": int | None, "text": str}`` where ``text`` is the first 500
        characters of the page's visible text and ``pages`` is the first
        whitespace-separated all-digit token strictly between 50 and 5000.
        Any failure is logged and yields an empty dict.
    """
    try:
        response = SESSION.get(f"{BASE_AA}/md5/{md5}", timeout=20)
        response.raise_for_status()
        page_text = BeautifulSoup(response.text, "html.parser").get_text(" ", strip=True)

        # NOTE(review): this is only a guess at the page count — any number in
        # range (e.g. a publication year) that appears first will be taken.
        pages = next(
            (
                int(token)
                for token in page_text.split()
                if token.isdigit() and 50 < int(token) < 5000
            ),
            None,
        )
        return {"pages": pages, "text": page_text[:500]}
    except Exception as e:
        log.warning(f"Detail fetch failed for md5={md5}: {e}")
        return {}
|
||||
|
||||
|
||||
def try_download(md5, dest_path):
    """Try each Libgen mirror in LIBGEN_MIRRORS until one delivers a PDF.

    Args:
        md5: MD5 hash substituted into each mirror URL template.
        dest_path: ``pathlib.Path`` of the final PDF; parent directories are
            created as needed.

    Returns:
        True once a file starting with ``%PDF`` has been stored at
        ``dest_path``; False if every mirror failed.

    The body is streamed to a sibling ``<name>.part`` file and only renamed to
    ``dest_path`` after the PDF signature check. An interrupted or bogus
    download therefore never leaves a file at ``dest_path`` that a later run
    would mistake for an existing book.
    """
    # Function-scope import so this block needs no change to the file header.
    from urllib.parse import urljoin

    tmp_path = dest_path.with_name(dest_path.name + ".part")

    for mirror_tpl in LIBGEN_MIRRORS:
        url = mirror_tpl.format(md5=md5)
        r = None
        try:
            r = SESSION.get(url, timeout=60, stream=True, allow_redirects=True)
            if r.status_code != 200:
                continue

            # Some mirrors return an HTML ads page before the real file.
            if "text/html" in r.headers.get("content-type", ""):
                soup = BeautifulSoup(r.text, "html.parser")
                dl_link = (soup.select_one("a[href*='.pdf']")
                           or soup.select_one("a[href*='get.php']"))
                if not dl_link:
                    continue
                # Resolve relative links against the page that served them.
                # Prefixing a fixed "https://libgen.is" produced malformed URLs
                # for hrefs without a leading slash and the wrong host for
                # every other mirror.
                actual_url = urljoin(r.url, dl_link["href"])
                r.close()
                r = SESSION.get(actual_url, timeout=120, stream=True)
                if r.status_code != 200:
                    continue

            # Stream to a temporary file next to the destination.
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(8192):
                    f.write(chunk)

            # Verify it's a real PDF before publishing it under the final name.
            with open(tmp_path, "rb") as f:
                header = f.read(4)
            if header == b"%PDF":
                tmp_path.replace(dest_path)
                size_mb = dest_path.stat().st_size / 1024 / 1024
                log.info(f" [OK] {dest_path.name} ({size_mb:.1f}MB) via {url}")
                return True
            log.warning(f" [BAD] Not a PDF from {url}")

        except Exception as e:
            log.warning(f" Mirror failed {url}: {e}")
        finally:
            # Runs on every exit path (continue, return, exception): release
            # the streamed connection and drop any leftover partial file.
            if r is not None:
                r.close()
            tmp_path.unlink(missing_ok=True)

    return False
|
||||
|
||||
|
||||
def process_book(title, author, subdir, dry_run):
    """Run the full search + download pipeline for one book.

    Args:
        title: Book title used for the search and the output filename.
        author: Author name; its last word is used to pick a candidate and as
            a filename suffix.
        subdir: Destination directory relative to BASE_LIB.
        dry_run: When true, stop after the search/detail lookup.

    Returns:
        A result dict with keys ``title``, ``author``, ``status``, ``md5``,
        ``pages``, ``file`` and ``notes``. ``status`` is one of "NOT FOUND",
        "DRY RUN — found", "ALREADY EXISTS", "DOWNLOADED" or "MD5 ONLY".
    """
    log.info(f"[SEARCH] '{title}' — {author}")
    result = {
        "title": title,
        "author": author,
        "status": "NOT FOUND",
        "md5": "",
        "pages": "",
        "file": "",
        "notes": "",
    }

    candidates = search_aa(title, author)
    if not candidates:
        result["notes"] = "No results from AA search"
        return result

    # An empty author string must not crash the run with an IndexError.
    name_parts = author.split()
    surname = name_parts[-1] if name_parts else ""

    # Pick best candidate — prefer one whose text contains the author's
    # surname, otherwise fall back to the first search result.
    needle = surname.lower()
    best = next((c for c in candidates if needle in c["text"].lower()), candidates[0])

    md5 = best["md5"]
    result["md5"] = md5

    details = get_book_details(md5)
    # get_book_details reports an unknown page count as None; keep the report
    # cell empty instead of printing the literal "None".
    result["pages"] = details.get("pages") or ""

    if dry_run:
        result["status"] = "DRY RUN — found"
        result["notes"] = f"MD5: {md5}"
        return result

    # Build destination path; anything outside alphanumerics and " ._-" in the
    # title becomes "_" and the title part is capped at 60 characters.
    safe_title = "".join(c if c.isalnum() or c in " ._-" else "_" for c in title)[:60]
    dest = BASE_LIB / subdir / f"{safe_title}_{surname}.pdf"

    if dest.exists():
        result["status"] = "ALREADY EXISTS"
        result["file"] = str(dest)
        return result

    log.info(f" MD5: {md5} — attempting download...")
    if try_download(md5, dest):
        result["status"] = "DOWNLOADED"
        result["file"] = str(dest)
    else:
        result["status"] = "MD5 ONLY"
        result["notes"] = f"All mirrors failed. MD5: {md5}"

    return result
|
||||
|
||||
|
||||
def write_report(results):
    """Write the Markdown acquisition report to REPORT_PATH.

    Args:
        results: List of result dicts as produced by ``process_book``.

    The report has a status-count summary followed by one table per non-empty
    status group. Dry-run hits get their own count row and section, which only
    appear when such results exist, so a ``--dry-run`` report no longer shows
    all-zero counts. The file is written as UTF-8 because the headings contain
    em dashes.
    """
    REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)

    def with_status(status):
        return [r for r in results if r["status"] == status]

    downloaded = with_status("DOWNLOADED")
    md5_only = with_status("MD5 ONLY")
    not_found = with_status("NOT FOUND")
    already_have = with_status("ALREADY EXISTS")
    dry_run_found = with_status("DRY RUN — found")

    lines = [
        "# Anna's Archive Acquisition Report",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"**Total searched:** {len(results)}",
        "",
        "| Status | Count |",
        "|--------|-------|",
        f"| Downloaded | {len(downloaded)} |",
        f"| MD5 only (mirrors failed) | {len(md5_only)} |",
        f"| Not found on AA | {len(not_found)} |",
        f"| Already in library | {len(already_have)} |",
    ]
    if dry_run_found:
        lines.append(f"| Found (dry run, not downloaded) | {len(dry_run_found)} |")
    lines.append("")

    if downloaded:
        lines += ["## Downloaded", ""]
        lines += ["| Title | Author | Pages | File |", "|-------|--------|-------|------|"]
        for r in downloaded:
            lines.append(f"| {r['title']} | {r['author']} | {r['pages']} | `{Path(r['file']).name}` |")
        lines.append("")

    if md5_only:
        lines += ["## Found on AA — Download Failed (use MD5 for manual retrieval)", ""]
        lines += ["| Title | Author | MD5 | Notes |", "|-------|--------|-----|-------|"]
        for r in md5_only:
            lines.append(f"| {r['title']} | {r['author']} | `{r['md5']}` | {r['notes']} |")
        lines.append("")

    if not_found:
        lines += ["## Not Found on Anna's Archive", ""]
        lines += ["| Title | Author | Notes |", "|-------|--------|-------|"]
        for r in not_found:
            lines.append(f"| {r['title']} | {r['author']} | {r['notes']} |")
        lines.append("")

    if already_have:
        lines += ["## Already in Library", ""]
        lines += ["| Title | Author |", "|-------|--------|"]
        for r in already_have:
            lines.append(f"| {r['title']} | {r['author']} |")
        lines.append("")

    if dry_run_found:
        lines += ["## Found on AA — Dry Run (nothing downloaded)", ""]
        lines += ["| Title | Author | MD5 | Pages |", "|-------|--------|-----|-------|"]
        for r in dry_run_found:
            lines.append(f"| {r['title']} | {r['author']} | `{r['md5']}` | {r['pages']} |")
        lines.append("")

    # Explicit encoding: the headings contain non-ASCII em dashes.
    REPORT_PATH.write_text("\n".join(lines), encoding="utf-8")
    log.info(f"Report written to {REPORT_PATH}")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: process the target list and write the report.

    Options:
        --dry-run   Search only; do not download anything.
        --limit N   Process only the first N targets (N >= 0).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()

    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be >= 0")

    # Compare against None, not truthiness: "--limit 0" means "no books",
    # it must not silently fall back to the whole list.
    targets = TARGETS if args.limit is None else TARGETS[:args.limit]
    log.info(f"Starting AA acquisition: {len(targets)} books | dry_run={args.dry_run}")

    results = []
    for i, (title, author, subdir) in enumerate(targets, 1):
        log.info(f"[{i}/{len(targets)}]")
        result = process_book(title, author, subdir, args.dry_run)
        results.append(result)
        log.info(f" -> {result['status']}")
        # Polite delay between requests; nothing follows the last book, so
        # there is no reason to wait after it.
        if i < len(targets):
            time.sleep(random.uniform(8, 15))

    write_report(results)

    print("\n-- Summary -----------------------------------------------")
    for status in ["DOWNLOADED", "MD5 ONLY", "NOT FOUND", "ALREADY EXISTS", "DRY RUN — found"]:
        count = sum(1 for r in results if r["status"] == status)
        if count:
            print(f" {status:<35} {count:>3}")
    print(f" Report: {REPORT_PATH}")


if __name__ == "__main__":
    main()
|
||||
478
scripts/aa_download_pass2.py
Executable file
478
scripts/aa_download_pass2.py
Executable file
|
|
@ -0,0 +1,478 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
aa_download_pass2.py — Second-pass downloader for books that failed in pass 1.
|
||||
|
||||
Reads the MD5 list from pass 1 report and tries:
|
||||
1. Z-Library search by title/author (separate catalog from Libgen)
|
||||
2. IPFS gateways using AA's IPFS CID (different from MD5 but findable)
|
||||
3. Alternative Libgen mirrors not tried in pass 1
|
||||
4. Direct AA slow download with longer timeout + retry
|
||||
|
||||
Checkpoint: saves progress to /opt/recon/data/aa_pass2_checkpoint.json
|
||||
so interrupted runs resume where they left off.
|
||||
|
||||
Usage:
|
||||
python3 /opt/recon/scripts/aa_download_pass2.py [--dry-run]
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
import hashlib
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
LOG_FILE = Path("/opt/recon/logs/aa_download_pass2.log")
# Pass 1 report (read for MD5s) and the report this script writes.
REPORT_IN = Path.home() / "projects/recon/aa_acquisition_report.md"
REPORT_OUT = Path.home() / "projects/recon/aa_acquisition_report_pass2.md"
# Per-book progress file used to resume interrupted runs.
CHECKPOINT = Path("/opt/recon/data/aa_pass2_checkpoint.json")
# Root directory for downloaded PDFs.
BASE_LIB = Path("/mnt/library/Acquired")

# logging.FileHandler opens its file as soon as it is constructed and raises
# FileNotFoundError when the parent directory is missing, which would abort
# the script at import time. Create the directory first.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()],
)
log = logging.getLogger("aa_pass2")

# One shared HTTP session; these browser-like headers go out with every request.
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Accept-Language": "en-US,en;q=0.9",
})
|
||||
|
||||
# ── Mirrors to try in order ───────────────────────────────────────────────────
# Every template is filled with the book's MD5 via str.format(md5=...).
#
# The former "https://<gateway>/ipfs/{md5}" entries were removed: an IPFS
# gateway path expects a content identifier (CID), not an MD5 hex digest, so
# those URLs could never resolve and each one only cost a timeout per book.
# IPFS retrieval is handled by try_ipfs_download(), which uses CIDs scraped
# from the AA detail page.
MIRRORS = [
    # Libgen alternatives
    "https://libgen.li/ads.php?md5={md5}",
    "https://library.lol/main/{md5}",
    "https://libgen.rocks/get.php?md5={md5}",
    # Z-Library direct MD5 endpoint (sometimes works)
    "https://z-library.se/md5/{md5}",
]
|
||||
|
||||
# ── Books that failed in pass 1 — title, author, md5, subdir ─────────────────
# Grouped by destination sub-directory; each entry is (title, author, md5).
# The grouping is flattened into PASS1_FAILURES below.
_FAILURES_BY_SHELF = {
    # Medical/Herbalism
    # NOTE(review): both Earthwise Herbal volumes carry the same MD5, so at
    # least one of them presumably points at the wrong file — verify.
    "Medical/Herbalism": [
        ("The Earthwise Herbal Volume 1", "Matthew Wood", "fc8dc19f5a17f38849a3979830dc95c1"),
        ("The Earthwise Herbal Volume 2", "Matthew Wood", "fc8dc19f5a17f38849a3979830dc95c1"),
        ("Herbal Antibiotics", "Stephen Buhner", "5839dab78edfdff0d7986fac62b814da"),
        ("The Herbal Medicine-Maker's Handbook", "James Green", "27e8e8a3585705ed194029b69c7d61b1"),
        ("Rosemary Gladstar's Medicinal Herbs", "Rosemary Gladstar", "9b1966f20a32ab4331bfece167be1dd0"),
    ],
    # Medical/Austere
    "Medical/Austere": [
        ("Wilderness Medicine", "Paul Auerbach", "957818eaa4ec40527bb05902f9ef7c51"),
        ("Medicine for Mountaineering", "James Wilkerson", "39cb07998f2034206f0c9472e44cb0b4"),
    ],
    # Medical/Veterinary
    "Medical/Veterinary": [
        ("The Chicken Health Handbook", "Gail Damerow", "0ba42fbea034b9a08ec8e2f8d7606efe"),
    ],
    # Power
    "Power": [
        ("The Renewable Energy Handbook", "William Kemp", "475d89fa80aea6c45aa4b1b4b9c5e274"),
        ("Homebrew Wind Power", "Dan Bartmann", "0578696d5b1b6bceb3e5e3302c1a31aa"),
        ("Wind Energy Basics", "Paul Gipe", "ccbe9d22e0a5e32d61921d20d66a8e05"),
        ("12-Volt Bible", "Brotherton", "3f964fa6d730fdf2c3d3e231e87cf692"),
        ("Wiring a House", "Rex Cauldwell", "5efcb53450e9eb560210eee40678adcf"),
    ],
    # Navigation
    "Navigation": [
        ("Emergency Navigation", "David Burch", "25e4def9e777b3fa9ca935134732ff9d"),
    ],
    # Water
    "Water": [
        ("Water Storage", "Art Ludwig", "17c965ec15c6cf4f09b5377b599a5266"),
        ("The Home Water Supply", "Stu Campbell", "9b22677d2f8e8b39f7a6bf032187295b"),
    ],
    # Food
    "Food": [
        ("Fermented Vegetables", "Kirsten Shockey", "74d3bde876b4c17be66c21fdfa85213e"),
        ("The Art of Natural Cheesemaking", "David Asher", "bc0e0829d701fea9beca912d39f8cc74"),
    ],
    # Permaculture
    "Permaculture": [
        ("Edible Forest Gardens Volume 1", "Dave Jacke", "6b069c3bb077fdd89d487a363c070fbb"),
        ("Edible Forest Gardens Volume 2", "Dave Jacke", "699255bfde7f69285c132a94ec291bf4"),
        ("Creating a Forest Garden", "Martin Crawford", "96d71d70dba31ae86e14845f913e557e"),
        ("Sepp Holzer's Permaculture", "Sepp Holzer", "32be55a9fce3e31cacd6912069abb410"),
        ("The Permaculture Handbook", "Peter Bane", "08cb4492739fda4d01b5a868a408e4a0"),
        ("The Market Gardener", "Jean-Martin Fortier", "ac69f6c8c22305b42b539482dc761c19"),
    ],
    # Scenario
    "Scenario": [
        ("SAS Survival Handbook", "John Wiseman", "fa967fd5fcbeb3c9887e22f73e590c64"),
        ("Pocket Ref", "Thomas Glover", "8e4988ce513a4aa75e7e6c00ee36692b"),
        ("Deep Survival", "Laurence Gonzales", "9a907ab13b81ea597407fffdb8ea1b04"),
    ],
    # Skills
    "Skills": [
        ("A Pattern Language", "Christopher Alexander", "7f5cc06b5399b65a278c4005ccd8d476"),
    ],
}

# Flat list of (title, author, md5, subdir) tuples, in shelf order, as consumed by main().
PASS1_FAILURES = [
    (title, author, md5, shelf)
    for shelf, books in _FAILURES_BY_SHELF.items()
    for title, author, md5 in books
]
|
||||
|
||||
|
||||
def load_checkpoint(path=None):
    """Load the resume checkpoint: a dict of {title: result_dict}.

    Args:
        path: Optional checkpoint file to read; defaults to CHECKPOINT.

    Returns:
        The stored mapping, or an empty dict when the file is missing,
        unreadable, not valid JSON, or does not hold a JSON object. Unusable
        files are reported with a warning instead of being ignored silently,
        so an unexpected "start from scratch" run can be explained.
    """
    checkpoint = CHECKPOINT if path is None else Path(path)
    if not checkpoint.exists():
        return {}
    try:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        data = json.loads(checkpoint.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        log.warning(f"Ignoring unreadable checkpoint {checkpoint}: {e}")
        return {}
    if not isinstance(data, dict):
        # main() indexes the checkpoint by title; anything else is unusable.
        log.warning(f"Ignoring checkpoint {checkpoint}: expected a JSON object")
        return {}
    return data
|
||||
|
||||
|
||||
def save_checkpoint(completed):
    """Persist the {title: result_dict} mapping to CHECKPOINT.

    The JSON is written to a sibling ``<checkpoint>.tmp`` file first and then
    moved over CHECKPOINT, so the checkpoint file itself is only ever swapped
    in as a whole.
    """
    CHECKPOINT.parent.mkdir(parents=True, exist_ok=True)
    staging = Path(f"{CHECKPOINT}.tmp")
    with staging.open("w") as handle:
        json.dump(completed, handle, indent=2)
    staging.replace(CHECKPOINT)
|
||||
|
||||
|
||||
def load_md5s_from_report(report_path=None):
    """Parse MD5 hashes from the pass 1 Markdown report.

    Scans table rows that contain a backtick and takes the third cell as the
    MD5 when, after stripping backticks, it is exactly 32 hexadecimal
    characters.

    Args:
        report_path: Optional report file to parse; defaults to REPORT_IN.

    Returns:
        Dict mapping the lower-cased title (first cell) to the lower-cased
        MD5. Empty when the report does not exist.
    """
    report = REPORT_IN if report_path is None else Path(report_path)
    if not report.exists():
        return {}

    hex_digits = set("0123456789abcdef")
    md5_map = {}
    # Undecodable bytes must not abort the run; the MD5 cells are plain ASCII.
    for line in report.read_text(encoding="utf-8", errors="replace").splitlines():
        if "`" not in line:
            continue
        parts = line.split("|")
        if len(parts) < 4:
            continue
        title = parts[1].strip()
        md5_cell = parts[3].strip().strip("`").lower()
        # Require real hex digits: str.isalnum() would also accept any other
        # 32-character alphanumeric cell.
        if len(md5_cell) == 32 and hex_digits.issuperset(md5_cell):
            md5_map[title.lower()] = md5_cell
    return md5_map
|
||||
|
||||
|
||||
def search_zlib(title, author):
    """Search Z-Library for "<title> <author>" (PDF only).

    Returns:
        The absolute URL of the first result link whose href contains
        ``/book/`` (only the first three links are considered), or None when
        the request fails, returns a non-200 status, or has no such link.
    """
    try:
        response = SESSION.get(
            "https://z-library.se/s/",
            params={"q": f"{title} {author}", "extension[]": "pdf"},
            timeout=15,
        )
        if response.status_code != 200:
            return None

        # Z-lib book links contain /book/
        anchors = BeautifulSoup(response.text, "html.parser").select("a[href*='/book/']")
        first_href = next(
            (a.get("href", "") for a in anchors[:3] if a.get("href", "")),
            None,
        )
        if first_href is not None:
            if first_href.startswith("/"):
                return f"https://z-library.se{first_href}"
            return first_href
    except Exception as e:
        log.debug(f"Zlib search failed: {e}")
    return None
|
||||
|
||||
|
||||
def try_zlib_download(book_url, dest_path):
    """Download a PDF via the download link on a Z-Library book page.

    Args:
        book_url: URL of the book page (as returned by ``search_zlib``).
        dest_path: ``pathlib.Path`` of the final PDF.

    Returns:
        True when a file starting with ``%PDF`` was stored at ``dest_path``,
        False otherwise. Errors are logged at debug level.

    The body is streamed to ``<name>.part`` and renamed only after the PDF
    signature check, so a broken transfer never leaves a file at
    ``dest_path``.
    """
    # Function-scope import so this block needs no change to the file header.
    from urllib.parse import urljoin

    tmp_path = dest_path.with_name(dest_path.name + ".part")
    r2 = None
    try:
        r = SESSION.get(book_url, timeout=15)
        soup = BeautifulSoup(r.text, "html.parser")
        dl = soup.select_one("a.addDownloadedBook, a[href*='/dl/'], a.btn-primary[href*='download']")
        if not dl:
            return False
        # urljoin copes with absolute links, "/rooted" links and plain
        # relative links; gluing the host and href together breaks the last.
        dl_url = urljoin(r.url, dl["href"])

        r2 = SESSION.get(dl_url, timeout=120, stream=True)
        if r2.status_code != 200:
            return False

        dest_path.parent.mkdir(parents=True, exist_ok=True)
        with open(tmp_path, "wb") as f:
            for chunk in r2.iter_content(8192):
                f.write(chunk)
        with open(tmp_path, "rb") as f:
            is_pdf = f.read(4) == b"%PDF"
        if is_pdf:
            tmp_path.replace(dest_path)
            return True
    except Exception as e:
        log.debug(f"Zlib download failed: {e}")
    finally:
        # Runs on every exit path: release the stream, drop any partial file.
        if r2 is not None:
            r2.close()
        tmp_path.unlink(missing_ok=True)
    return False
|
||||
|
||||
|
||||
def try_mirrors(md5, dest_path):
    """Try every template in MIRRORS with the given MD5.

    Args:
        md5: MD5 hash substituted into each mirror template.
        dest_path: ``pathlib.Path`` of the final PDF.

    Returns:
        True once a file starting with ``%PDF`` has been stored at
        ``dest_path``; False if every mirror failed.

    When a mirror answers with HTML, the page is searched for a download link
    in this order: an href starting with ``get.php?md5=``, one containing
    ``.pdf``, one containing ``get.php``, one containing ``/get/``. The link
    is resolved against the page that served it. The body is streamed to
    ``<name>.part`` and renamed only after the PDF signature check.
    """
    # Function-scope import so this block needs no change to the file header.
    from urllib.parse import urljoin

    tmp_path = dest_path.with_name(dest_path.name + ".part")

    for tpl in MIRRORS:
        url = tpl.format(md5=md5)
        r = None
        try:
            r = SESSION.get(url, timeout=20, stream=True, allow_redirects=True)
            if r.status_code != 200:
                continue

            if "html" in r.headers.get("content-type", ""):
                soup = BeautifulSoup(r.text, "html.parser")
                # Taking the href from the parsed tree (not a regex over the
                # raw HTML) also decodes entities such as "&amp;" in the
                # keyed get.php link.
                dl = (soup.select_one("[href^='get.php?md5=']")
                      or soup.select_one("a[href*='.pdf']")
                      or soup.select_one("a[href*='get.php']")
                      or soup.select_one("a[href*='/get/']"))
                if not dl:
                    continue
                # Resolve against the serving page rather than a fixed
                # libgen.li host, so every mirror's own links work.
                actual = urljoin(r.url, dl["href"])
                r.close()
                r = SESSION.get(actual, timeout=60, stream=True)
                if r.status_code != 200:
                    continue

            dest_path.parent.mkdir(parents=True, exist_ok=True)
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(8192):
                    f.write(chunk)
            with open(tmp_path, "rb") as f:
                is_pdf = f.read(4) == b"%PDF"
            if is_pdf:
                tmp_path.replace(dest_path)
                size_mb = dest_path.stat().st_size / 1024 / 1024
                log.info(f" [OK] {size_mb:.1f}MB via {url}")
                return True
        except Exception as e:
            log.debug(f"Mirror {url} failed: {e}")
        finally:
            # Runs on every exit path: release the stream, drop partial file.
            if r is not None:
                r.close()
            tmp_path.unlink(missing_ok=True)
        # Short pause before the next mirror after a failed attempt.
        time.sleep(2)
    return False
|
||||
|
||||
|
||||
def get_ipfs_cids(md5):
    """Scrape candidate IPFS CIDs from the AA detail page for ``md5``.

    Two patterns are searched in the raw page text: ``ipfs_cid`` followed by
    colons/whitespace and a run of 46+ alphanumerics, and ``ipfs://`` followed
    by such a run.

    Returns:
        List of CID strings: all ``ipfs_cid`` matches first, then any
        ``ipfs://`` matches not already listed. Empty on a non-200 response or
        on error (logged at debug level).
    """
    import re as _re

    cids = []
    try:
        response = SESSION.get(f"https://annas-archive.gl/md5/{md5}", timeout=20)
        if response.status_code == 200:
            html = response.text
            cids.extend(_re.findall(r'ipfs_cid[:\s]+([A-Za-z0-9]{46,})', html))
            # Also check for CIDs in href attributes
            for cid in _re.findall(r'ipfs://([A-Za-z0-9]{46,})', html):
                if cid not in cids:
                    cids.append(cid)
    except Exception as e:
        log.debug(f"IPFS CID fetch failed: {e}")
    return cids
|
||||
|
||||
|
||||
def try_ipfs_download(cids, dest_path):
    """Try downloading via IPFS public gateways.

    Args:
        cids: List of IPFS CIDs; only the first three are tried.
        dest_path: ``pathlib.Path`` of the final PDF.

    Returns:
        True once a file starting with ``%PDF`` has been stored at
        ``dest_path``; False if no CID/gateway combination worked.

    The body is streamed to ``<name>.part`` and renamed only after the PDF
    signature check, so a stalled gateway never leaves a truncated file at
    ``dest_path``.
    """
    gateways = [
        "https://cloudflare-ipfs.com/ipfs/{}",
        "https://dweb.link/ipfs/{}",
    ]
    tmp_path = dest_path.with_name(dest_path.name + ".part")

    for cid in cids[:3]:  # limit to first 3 CIDs
        for gw_tpl in gateways:
            url = gw_tpl.format(cid)
            r = None
            try:
                r = SESSION.get(url, timeout=15, stream=True)
                if r.status_code != 200:
                    continue
                dest_path.parent.mkdir(parents=True, exist_ok=True)
                with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(8192):
                        f.write(chunk)
                with open(tmp_path, "rb") as f:
                    is_pdf = f.read(4) == b"%PDF"
                if is_pdf:
                    tmp_path.replace(dest_path)
                    size_mb = dest_path.stat().st_size / 1024 / 1024
                    log.info(f" [OK] {size_mb:.1f}MB via IPFS {url[:60]}...")
                    return True
            except Exception as e:
                log.debug(f"IPFS {url} failed: {e}")
            finally:
                # Runs on every exit path: release stream, drop partial file.
                if r is not None:
                    r.close()
                tmp_path.unlink(missing_ok=True)
            # Short pause before the next gateway after a failed attempt.
            time.sleep(1)
    return False
|
||||
|
||||
|
||||
def search_aa_fresh(title, author):
    """Search the Anna's Archive mirrors for a book with no MD5 from pass 1.

    The .gl, .se and .org domains are tried in that order.

    Args:
        title: Book title.
        author: Author name; its last word is treated as the surname.

    Returns:
        The MD5 (32 lower-case hex characters) of the first result whose link
        text contains the author's surname, or contains every title word
        longer than three characters. None when no mirror yields a match.

    Matching on the first title word alone accepted almost any result for
    titles starting with "The" or "A", so short words are ignored and all
    remaining title words must be present.
    """
    name_parts = author.split()
    surname = name_parts[-1].lower() if name_parts else ""
    title_words = [
        word for word in (w.strip(".,:;!?'\"()") for w in title.lower().split())
        if len(word) > 3
    ]
    hex_digits = set("0123456789abcdef")

    for domain in ["annas-archive.gl", "annas-archive.se", "annas-archive.org"]:
        try:
            url = f"https://{domain}/search"
            params = {"q": f"{title} {author}", "ext": "pdf", "lang": "en"}
            r = SESSION.get(url, params=params, timeout=15)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            for a in soup.select("a[href^='/md5/']"):
                text = a.get_text(" ", strip=True)
                if not text:
                    continue
                # Drop any sub-path or query string after the hash.
                tail = a["href"].split("/md5/")[-1]
                md5 = tail.split("/")[0].split("?")[0].strip().lower()
                if len(md5) != 32 or not hex_digits.issuperset(md5):
                    continue
                haystack = text.lower()
                surname_hit = bool(surname) and surname in haystack
                title_hit = bool(title_words) and all(w in haystack for w in title_words)
                if surname_hit or title_hit:
                    return md5
        except Exception as e:
            # Best effort: fall through to the next mirror domain.
            log.debug(f"AA search on {domain} failed: {e}")
            continue
    return None
|
||||
|
||||
|
||||
def process_book(title, author, md5_hint, subdir, dry_run):
    """Run the pass 2 acquisition strategies for a single book.

    Order of attempts: Z-Library search/download, a fresh AA search when no
    MD5 is known, IPFS gateways using CIDs from the AA page, then the MD5
    mirror list.

    Args:
        title: Book title (also used for the output filename).
        author: Author name; its last word becomes the filename suffix.
        md5_hint: MD5 known from pass 1, or an empty value.
        subdir: Destination directory relative to BASE_LIB.
        dry_run: When true, return before any network request.

    Returns:
        Result dict with keys ``title``, ``author``, ``status``, ``md5``,
        ``file`` and ``notes``.
    """
    report = {
        "title": title, "author": author,
        "status": "NOT FOUND", "md5": md5_hint,
        "file": "", "notes": "",
    }

    # Anything outside alphanumerics and " ._-" becomes "_"; title part capped at 60.
    cleaned = "".join(ch if (ch.isalnum() or ch in " ._-") else "_" for ch in title)
    surname = author.split()[-1]
    dest = BASE_LIB / subdir / f"{cleaned[:60]}_{surname}.pdf"

    def finished(status):
        # Record a status that goes together with the destination path.
        report["status"] = status
        report["file"] = str(dest)
        return report

    if dest.exists():
        return finished("ALREADY EXISTS")

    if dry_run:
        report["status"] = "DRY RUN"
        return report

    # 1. Try Z-Library first (different catalog)
    log.info(" Trying Z-Library...")
    zlib_url = search_zlib(title, author)
    if zlib_url and try_zlib_download(zlib_url, dest):
        return finished("DOWNLOADED (Z-Library)")

    # 2. If no MD5 from pass 1, do a fresh AA search
    md5 = md5_hint
    if not md5:
        log.info(" Searching AA for fresh MD5...")
        md5 = search_aa_fresh(title, author)
        if md5:
            report["md5"] = md5
            log.info(f" Found MD5: {md5}")

    if not md5:
        report["notes"] = "Not found on AA or Z-Library"
        return report

    # 3. Try IPFS with real CIDs from AA detail page
    log.info(" Fetching IPFS CIDs from AA...")
    cids = get_ipfs_cids(md5)
    if cids:
        log.info(f" Found {len(cids)} IPFS CID(s), trying gateways...")
        if try_ipfs_download(cids, dest):
            return finished("DOWNLOADED (IPFS)")

    # 4. Try all mirrors with MD5
    log.info(f" Trying mirrors with MD5 {md5}...")
    if try_mirrors(md5, dest):
        return finished("DOWNLOADED (mirror)")

    report["status"] = "MD5 ONLY"
    report["notes"] = f"MD5 confirmed, all mirrors failed: {md5}"
    return report
|
||||
|
||||
|
||||
def write_report(results):
    """Write the pass 2 Markdown report to REPORT_OUT.

    Results are grouped by status: any status containing "DOWNLOADED",
    "ALREADY EXISTS", "MD5 ONLY" and "NOT FOUND". Each non-empty group gets
    its own table, in that order, below a one-line summary.
    """
    def exactly(status):
        return [r for r in results if r["status"] == status]

    downloaded = [r for r in results if "DOWNLOADED" in r["status"]]
    md5_only = exactly("MD5 ONLY")
    not_found = exactly("NOT FOUND")
    existing = exactly("ALREADY EXISTS")

    summary = (
        f"**Searched:** {len(results)} | **Downloaded:** {len(downloaded)} | "
        f"**MD5 only:** {len(md5_only)} | **Not found:** {len(not_found)}"
    )
    out = [
        "# AA Acquisition Report -- Pass 2",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        summary,
        "",
    ]

    def add_section(heading, header_row, divider_row, rows):
        # Heading, blank line, table header, one line per row, trailing blank line.
        out.extend([heading, "", header_row, divider_row])
        out.extend(rows)
        out.append("")

    if downloaded:
        add_section(
            "## Downloaded",
            "| Title | Author | Via | File |",
            "|-------|--------|-----|------|",
            [f"| {r['title']} | {r['author']} | {r['status']} | `{Path(r['file']).name}` |"
             for r in downloaded],
        )

    if existing:
        add_section(
            "## Already in Library",
            "| Title | Author |",
            "|-------|--------|",
            [f"| {r['title']} | {r['author']} |" for r in existing],
        )

    if md5_only:
        add_section(
            "## MD5 Known -- All Mirrors Failed",
            "| Title | Author | MD5 |",
            "|-------|--------|-----|",
            [f"| {r['title']} | {r['author']} | `{r['md5']}` |" for r in md5_only],
        )

    if not_found:
        add_section(
            "## Not Found Anywhere",
            "| Title | Author | Notes |",
            "|-------|--------|-------|",
            [f"| {r['title']} | {r['author']} | {r['notes']} |" for r in not_found],
        )

    REPORT_OUT.parent.mkdir(parents=True, exist_ok=True)
    REPORT_OUT.write_text("\n".join(out))
    log.info(f"Report written to {REPORT_OUT}")
|
||||
|
||||
|
||||
def main():
    """Run pass 2 over every pass-1 failure, then write and print a summary.

    For each ``PASS1_FAILURES`` entry the MD5 hint is taken from the entry
    itself or, failing that, from the map returned by
    ``load_md5s_from_report()`` (looked up by lower-cased title). Books already
    present in the checkpoint are reused without re-processing unless
    ``--dry-run`` is given; outside dry-run mode the checkpoint is saved after
    every processed book.
    """
    from collections import Counter

    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # Load any MD5s captured in pass 1
    md5_map = load_md5s_from_report()
    targets = [
        (title, author, md5_hint or md5_map.get(title.lower(), ""), subdir)
        for title, author, md5_hint, subdir in PASS1_FAILURES
    ]

    # Load checkpoint
    completed = load_checkpoint()
    if completed:
        log.info(f"Resuming: {len(completed)} books already processed in previous run")

    total = len(targets)
    log.info(f"Pass 2: {total} books | dry_run={args.dry_run}")
    results = []
    for i, (title, author, md5, subdir) in enumerate(targets, 1):
        # Check checkpoint — skip already-processed books
        if title in completed and not args.dry_run:
            result = completed[title]
            results.append(result)
            log.info(f"[{i}/{total}] {title} — SKIPPED (checkpoint: {result['status']})")
            continue

        log.info(f"[{i}/{total}] {title} -- {author}")
        result = process_book(title, author, md5, subdir, args.dry_run)
        results.append(result)
        log.info(f" -> {result['status']}")

        # Save checkpoint after each book (not in dry-run)
        if not args.dry_run:
            completed[title] = result
            save_checkpoint(completed)

        # Politeness delay between books; there is nothing to wait for once
        # the final target has been handled.
        if i < total:
            time.sleep(random.uniform(6, 12))

    write_report(results)

    known_statuses = [
        "DOWNLOADED (Z-Library)", "DOWNLOADED (IPFS)", "DOWNLOADED (mirror)",
        "MD5 ONLY", "NOT FOUND", "ALREADY EXISTS", "DRY RUN",
    ]
    counts = Counter(r["status"] for r in results)
    # Known statuses first in their fixed order, then any status that is not
    # in the list so that no result is dropped from the summary.
    extra_statuses = sorted(s for s in counts if s not in known_statuses)

    print("\n-- Pass 2 Summary ----------------------------------------")
    for status in known_statuses + extra_statuses:
        count = counts[status]
        if count:
            print(f" {status:<35} {count:>3}")
    print(f" Report: {REPORT_OUT}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
64
scripts/backup.sh
Executable file
64
scripts/backup.sh
Executable file
|
|
@ -0,0 +1,64 @@
|
|||
#!/bin/bash
# RECON Backup Script
# Backs up the precious data: concept JSONs, text extracts, SQLite DB
# Qdrant is NOT backed up — rebuilt from JSONs via `recon rebuild`
# Destination: Contabo VPS (100.64.0.1) via rsync+SSH

set -euo pipefail

RECON_DIR="/opt/recon"
DATA_DIR="$RECON_DIR/data"
LOG_FILE="$RECON_DIR/logs/backup.log"
DATE=$(date +%Y%m%d_%H%M%S)

BACKUP_HOST="root@100.64.0.1"
BACKUP_BASE="/opt/backups/recon"

# Local snapshot of the SQLite DB; defined up front so the EXIT trap can
# always reference it under `set -u`.
LOCAL_DB_BACKUP="/tmp/recon_${DATE}.db"

# log MESSAGE — timestamped line to stdout and appended to $LOG_FILE.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# Remove the local DB snapshot on every exit path. With `set -e` a failing
# rsync aborts the script, which would otherwise leave the snapshot in /tmp.
cleanup() {
    rm -f "$LOCAL_DB_BACKUP"
}
trap cleanup EXIT

# sync_dir SRC_DIR DEST_NAME — mirror SRC_DIR to $BACKUP_BASE/DEST_NAME/ with
# --delete. Refuses to run when SRC_DIR is missing or empty: mirroring an
# empty source with --delete would erase the remote copy.
sync_dir() {
    local src="$1" dest="$2"
    if [ ! -d "$src" ] || [ -z "$(ls -A "$src" 2>/dev/null)" ]; then
        log "ERROR: $src is missing or empty; refusing to mirror it with --delete"
        return 1
    fi
    rsync -az --delete "$src/" "$BACKUP_HOST:$BACKUP_BASE/$dest/"
}

mkdir -p "$RECON_DIR/logs"

log "=== RECON Backup Starting ==="

# Make sure the destination exists before the first single-file rsync.
ssh "$BACKUP_HOST" "mkdir -p '$BACKUP_BASE'"

# ── 1. SQLite DB (small, fast, critical) ──
log "Backing up recon.db..."
sqlite3 "$DATA_DIR/recon.db" ".backup '$LOCAL_DB_BACKUP'"
rsync -az "$LOCAL_DB_BACKUP" "$BACKUP_HOST:$BACKUP_BASE/recon_${DATE}.db"
rm -f "$LOCAL_DB_BACKUP"
# Keep last 7 daily DB backups on remote
ssh "$BACKUP_HOST" "ls -t $BACKUP_BASE/recon_*.db 2>/dev/null | tail -n +8 | xargs rm -f 2>/dev/null || true"
log " recon.db backed up"

# ── 2. Concept JSONs (THE PRECIOUS DATA — $130+ of Gemini work) ──
log "Syncing concept JSONs..."
sync_dir "$DATA_DIR/concepts" "concepts"
CONCEPT_COUNT=$(find "$DATA_DIR/concepts/" -name "*.json" 2>/dev/null | wc -l)
log " concepts synced ($CONCEPT_COUNT JSON files)"

# ── 3. Text extracts (regenerable but expensive in time) ──
log "Syncing text extracts..."
sync_dir "$DATA_DIR/text" "text"
# -mindepth 1 excludes the text/ directory itself from the count.
TEXT_COUNT=$(find "$DATA_DIR/text/" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | wc -l)
log " text synced ($TEXT_COUNT document dirs)"

# ── 4. Intel feeds ──
if [ -d "$DATA_DIR/intel" ]; then
    log "Syncing intel feeds..."
    rsync -az --delete "$DATA_DIR/intel/" "$BACKUP_HOST:$BACKUP_BASE/intel/"
    log " intel synced"
fi

# ── 5. Config files ──
log "Backing up config..."
rsync -az "$RECON_DIR/config.yaml" "$BACKUP_HOST:$BACKUP_BASE/config_${DATE}.yaml"
# .env is optional: a missing file must not abort the backup.
rsync -az "$RECON_DIR/.env" "$BACKUP_HOST:$BACKUP_BASE/env_${DATE}" 2>/dev/null || true
# Keep the three most recent config and env snapshots on the remote.
ssh "$BACKUP_HOST" "ls -t $BACKUP_BASE/config_*.yaml 2>/dev/null | tail -n +4 | xargs rm -f 2>/dev/null || true"
ssh "$BACKUP_HOST" "ls -t $BACKUP_BASE/env_* 2>/dev/null | tail -n +4 | xargs rm -f 2>/dev/null || true"
log " config backed up"

# ── Summary ──
BACKUP_SIZE=$(ssh "$BACKUP_HOST" "du -sh $BACKUP_BASE" | cut -f1)
log "=== Backup Complete: $BACKUP_SIZE on Contabo ==="
|
||||
449
scripts/cleanup_outliers.py
Executable file
449
scripts/cleanup_outliers.py
Executable file
|
|
@ -0,0 +1,449 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
cleanup_outliers.py — Three-pass cleanup of RECON concept data.
|
||||
|
||||
Pass 1: Remap ~160 non-canonical domain strings in concept JSONs + Qdrant payloads
|
||||
Pass 2: Re-enrich 434 concepts with empty domain arrays via Gemini
|
||||
Pass 3: Purge junk/noise URLs from Qdrant + SQLite DB
|
||||
|
||||
Usage:
|
||||
python3 /opt/recon/scripts/cleanup_outliers.py [--dry-run] [--skip-pass N]
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
import argparse
|
||||
import threading
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from collections import defaultdict
|
||||
|
||||
import google.generativeai as genai
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import FieldCondition, MatchAny, Filter
|
||||
|
||||
import sys, os
|
||||
sys.path.insert(0, '/opt/recon')
|
||||
from lib.utils import get_config, setup_logging
|
||||
|
||||
LOG_FILE = Path("/opt/recon/logs/cleanup_outliers.log")
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(message)s",
|
||||
handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
|
||||
)
|
||||
log = logging.getLogger("cleanup_outliers")
|
||||
|
||||
CONCEPTS_DIR = Path("/opt/recon/data/concepts")
|
||||
DB_PATH = Path("/opt/recon/data/recon.db")
|
||||
|
||||
CANONICAL_DOMAINS = {
|
||||
"Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
|
||||
"Foundational Skills", "Communications", "Medical", "Food Systems",
|
||||
"Navigation", "Logistics", "Power Systems", "Leadership",
|
||||
"Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
|
||||
}
|
||||
|
||||
# Non-canonical → canonical remap
|
||||
OUTLIER_MAP = {
|
||||
"Zoology": "Sustainment Systems",
|
||||
"Botany": "Sustainment Systems",
|
||||
"Nature Lore": "Sustainment Systems",
|
||||
"Ecology": "Sustainment Systems",
|
||||
"Navigational Astronomy": "Navigation",
|
||||
"Troubleshooting": "Foundational Skills",
|
||||
"Chemistry": "Foundational Skills",
|
||||
"Metallurgy": "Foundational Skills",
|
||||
"Weird Science": "Foundational Skills",
|
||||
"Philosophy of physics": "Foundational Skills",
|
||||
"Physics": "Foundational Skills",
|
||||
"Cell biology": "Foundational Skills",
|
||||
"Economics": "Leadership",
|
||||
"Business": "Leadership",
|
||||
"Safety": "Security",
|
||||
"Law Enforcement": "Security",
|
||||
"Security & Intelligence": "Security",
|
||||
"Fire Weather": "Scenario Playbooks",
|
||||
"Legal": "Leadership",
|
||||
# Discard — replace with closest real domain
|
||||
"Site News": "Foundational Skills",
|
||||
"Paleogeography": "Foundational Skills",
|
||||
"Chemical Manipulation": "Foundational Skills",
|
||||
}
|
||||
|
||||
# Junk URL patterns — pages with no knowledge value
|
||||
JUNK_URL_PATTERNS = [
|
||||
# rocketstoves.com nav/template garbage
|
||||
"rocketstoves.com/favicon",
|
||||
"rocketstoves.com/cropped-favicon",
|
||||
"rocketstoves.com/layouts/",
|
||||
"rocketstoves.com/sample",
|
||||
"rocketstoves.com/templates/",
|
||||
"rocketstoves.com/hello-world",
|
||||
"rocketstoves.com/blog-forthcoming",
|
||||
"rocketstoves.com/contact",
|
||||
"rocketstoves.com/acknowledgements",
|
||||
"rocketstoves.com/ja3",
|
||||
"rocketstoves.com/juxtapositions",
|
||||
"rocketstoves.com/no-name-soi",
|
||||
"rocketstoves.com/big4",
|
||||
"rocketstoves.com/roof",
|
||||
"rocketstoves.com/rmh_dloadcover",
|
||||
"rocketstoves.com/pedcover",
|
||||
"rocketstoves.com/laundry-to-landscape",
|
||||
"rocketstoves.com/barreloven",
|
||||
# NRCS calendar/event noise
|
||||
"nrcs.usda.gov/events/",
|
||||
"nrcs.usda.gov/state-offices/massachusetts",
|
||||
"nrcs.usda.gov/state-offices/nebraska",
|
||||
"nrcs.usda.gov/state-offices/oklahoma",
|
||||
"nrcs.usda.gov/state-offices/utah",
|
||||
"nrcs.usda.gov/conservation-basics/natural-resource-concerns/soil/western-call-for-abstracts",
|
||||
# deeranddeerhunting trophy hunt videos (no knowledge value)
|
||||
"deeranddeerhunting.com/trophy-whitetails-exclusive-videos/",
|
||||
# eattheweeds non-content pages
|
||||
"eattheweeds.com/media-interviews-with-green-deane",
|
||||
"eattheweeds.com/motorcycles-and-mushrooms",
|
||||
"eattheweeds.com/sunny-savage",
|
||||
# foragersharvest nav pages
|
||||
"foragersharvest.com/contact",
|
||||
"foragersharvest.com/podcasts",
|
||||
# motherearthnews classifieds/nav
|
||||
"motherearthnews.com/classifieds/",
|
||||
"motherearthnews.com/biographies/",
|
||||
]
|
||||
|
||||
CLASSIFY_PROMPT = """\
|
||||
Classify this knowledge concept into one or more domains.
|
||||
|
||||
VALID DOMAINS (use ONLY these exact strings):
|
||||
Defense & Tactics, Sustainment Systems, Off-Grid Systems, Foundational Skills,
|
||||
Communications, Medical, Food Systems, Navigation, Logistics, Power Systems,
|
||||
Leadership, Scenario Playbooks, Water Systems, Security, Community Coordination
|
||||
|
||||
Concept title: {title}
|
||||
Concept tags: {subdomain}
|
||||
Concept preview: {content}
|
||||
|
||||
Return ONLY valid JSON, no markdown:
|
||||
{{"domain": ["Domain Name"]}}
|
||||
|
||||
Rules:
|
||||
- Never return empty domain list
|
||||
- Medical content, herbs, first aid, veterinary → Medical
|
||||
- Food growing, foraging, hunting, livestock → Sustainment Systems
|
||||
- Food preservation, canning, storage → Food Systems
|
||||
- Solar, wind, batteries, generators → Power Systems
|
||||
- Water sourcing, filtration, sanitation → Water Systems
|
||||
"""
|
||||
|
||||
def load_gemini_keys(env_path="/opt/recon/.env"):
    """Collect Gemini API keys from the ``GEMINI_KEY_*`` lines of a dotenv file.

    Args:
        env_path: path of the dotenv file to read. Defaults to
            ``/opt/recon/.env``.

    Returns:
        List of key strings in file order. Surrounding whitespace and one
        matching pair of quotes are removed from each value; lines without
        ``=`` or with an empty value are skipped.

    Raises:
        OSError: if the file cannot be read.
    """
    keys = []
    for raw_line in Path(env_path).read_text().splitlines():
        line = raw_line.strip()
        if not line.startswith("GEMINI_KEY_"):
            continue
        # partition() never raises, unlike split("=", 1)[1] on a line
        # that has no "=" at all.
        _, sep, value = line.partition("=")
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        if sep and value:
            keys.append(value)
    return keys
|
||||
|
||||
class KeyRotator:
    """Round-robin dispenser over a list of API keys, guarded by a lock."""

    def __init__(self, keys):
        self.keys = keys
        self._i = 0  # number of keys handed out so far
        self._lock = threading.Lock()

    def next(self):
        """Return the next key in rotation, wrapping around at the end."""
        with self._lock:
            slot = self._i % len(self.keys)
            chosen = self.keys[slot]
            self._i += 1
        return chosen
|
||||
|
||||
def classify_concept(title, subdomains, content, key):
    """Ask Gemini to assign canonical domains to a single concept.

    The prompt is built from ``CLASSIFY_PROMPT`` using the title, up to ten
    subdomain tags and the first 300 characters of the content. Up to four
    attempts are made; errors whose text looks like rate limiting or a 503 are
    retried with exponential back-off, any other error stops the retries.

    Args:
        title: concept title; empty values are sent as "(untitled)".
        subdomains: list of tag strings (a single string is treated as one tag).
        content: concept text or summary; converted with ``str()``.
        key: Gemini API key to configure the client with.

    Returns:
        Non-empty list of names from ``CANONICAL_DOMAINS``. When no valid
        domain could be obtained, ``["Foundational Skills"]`` is returned and
        a warning is logged.
    """
    if isinstance(subdomains, str):
        # A bare string would otherwise be sliced and joined per character.
        subdomains = [subdomains]
    prompt = CLASSIFY_PROMPT.format(
        title=title or "(untitled)",
        subdomain=", ".join(str(s) for s in subdomains[:10]) if subdomains else "(none)",
        content=str(content)[:300] if content else "(none)",
    )
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"}
    )
    max_attempts = 4
    for attempt in range(max_attempts):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
            raw = data.get("domain", [])
            if isinstance(raw, str):
                # Tolerate {"domain": "Medical"} instead of a list.
                raw = [raw]
            domains = [d for d in raw if d in CANONICAL_DOMAINS]
            if domains:
                return domains
        except Exception as e:
            err = str(e).lower()
            if not any(s in err for s in ["429", "quota", "rate", "503"]):
                log.warning("Gemini classification failed for %r: %s", title, e)
                break
            # Back off before the next attempt; sleeping after the final
            # attempt would only delay the fallback.
            if attempt < max_attempts - 1:
                time.sleep(min(5 * (2 ** attempt) + random.uniform(0, 3), 60))
    log.warning("No valid domain obtained for %r; falling back to Foundational Skills", title)
    return ["Foundational Skills"]
|
||||
|
||||
# ── PASS 1: Remap outlier domains ────────────────────────────────────────────
|
||||
|
||||
def remap_concept_domains(domains):
    """Remap any outlier domain names in a domain list.

    Canonical names are kept, names present in ``OUTLIER_MAP`` are replaced by
    their mapped canonical name, and every other entry is dropped.

    Args:
        domains: iterable of domain entries (normally strings).

    Returns:
        ``(remapped, changed)``: ``remapped`` is the de-duplicated list of
        canonical names in first-seen order (so rewritten files are stable
        from run to run); ``changed`` is True when at least one entry was
        remapped or dropped.
    """
    remapped = {}  # dict used as an insertion-ordered set
    changed = False
    for name in domains:
        is_name = isinstance(name, str)  # non-strings can never match a name
        if is_name and name in CANONICAL_DOMAINS:
            remapped[name] = None
        elif is_name and name in OUTLIER_MAP:
            remapped[OUTLIER_MAP[name]] = None
            changed = True
        else:
            changed = True  # drop unknown
    return list(remapped), changed
|
||||
|
||||
def pass1_remap_outliers(qdrant, collection, dry_run):
    """Pass 1: replace non-canonical domain names in Qdrant and in concept JSONs.

    Qdrant points whose ``domain`` payload contains any ``OUTLIER_MAP`` key are
    collected by scrolling, remapped with ``remap_concept_domains`` and updated
    via ``set_payload``. Every ``window_*.json`` file under ``CONCEPTS_DIR`` is
    then scanned and rewritten when one of its concepts changed. A domain list
    that ends up empty is replaced with ``["Foundational Skills"]``.

    Args:
        qdrant: Qdrant client used for ``scroll`` and ``set_payload``.
        collection: name of the Qdrant collection.
        dry_run: when True, nothing is written; counts are still collected.

    Returns:
        ``defaultdict(int)`` with ``qdrant_updated``, ``json_updated`` and
        ``json_unreadable`` counters.
    """
    log.info("=== PASS 1: Remapping non-canonical outlier domains ===")
    outlier_names = list(OUTLIER_MAP.keys())
    stats = defaultdict(int)

    # Scroll through Qdrant finding affected vectors. Only names listed in
    # OUTLIER_MAP are matched by this filter.
    outlier_filter = Filter(
        must=[FieldCondition(
            key="domain",
            match=MatchAny(any=outlier_names)
        )]
    )
    offset = None
    affected_points = []
    while True:
        results, offset = qdrant.scroll(
            collection_name=collection,
            scroll_filter=outlier_filter,
            limit=500,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        affected_points.extend(results)
        if offset is None:
            break

    log.info(f"Found {len(affected_points)} Qdrant points with outlier domains")

    for point in affected_points:
        old_domains = point.payload.get("domain", [])
        if isinstance(old_domains, str):
            old_domains = [old_domains]

        new_domains, changed = remap_concept_domains(old_domains)
        if not changed:
            continue

        stats["qdrant_updated"] += 1
        if not dry_run:
            qdrant.set_payload(
                collection_name=collection,
                payload={"domain": new_domains or ["Foundational Skills"]},
                points=[point.id],
            )

    # Also fix concept JSON files on disk
    for window_file in CONCEPTS_DIR.rglob("window_*.json"):
        try:
            with open(window_file, "r", encoding="utf-8") as f:
                concepts = json.load(f)
        except Exception as exc:
            # Best effort: an unreadable file is skipped, but leave a trace.
            log.warning("Skipping unreadable concept file %s: %s", window_file, exc)
            stats["json_unreadable"] += 1
            continue

        if not isinstance(concepts, list):
            continue

        file_changed = False
        for concept in concepts:
            if not isinstance(concept, dict):
                continue
            raw = concept.get("domain", [])
            if isinstance(raw, str):
                raw = [raw]
            new, changed = remap_concept_domains(raw)
            if changed:
                concept["domain"] = new if new else ["Foundational Skills"]
                file_changed = True

        if file_changed:
            stats["json_updated"] += 1
            if not dry_run:
                with open(window_file, "w", encoding="utf-8") as f:
                    json.dump(concepts, f, indent=2, ensure_ascii=False)

    log.info(f"Pass 1 complete: {stats['qdrant_updated']} Qdrant points updated, {stats['json_updated']} JSON files updated")
    return stats
|
||||
|
||||
# ── PASS 2: Re-enrich empty domain concepts ──────────────────────────────────
|
||||
|
||||
def pass2_empty_domains(qdrant, collection, key_rotator, dry_run):
    """Pass 2: classify concepts whose domain list is empty.

    All points of the collection are scrolled; those whose ``domain`` payload
    is missing, empty or ``[""]`` are classified with ``classify_concept``.
    The new domains are written to the Qdrant payload and to the matching
    concept (same title, still empty domain) in the document's
    ``window_*.json`` files under ``CONCEPTS_DIR/<doc_hash>``.

    Args:
        qdrant: Qdrant client used for ``scroll`` and ``set_payload``.
        collection: name of the Qdrant collection.
        key_rotator: object whose ``next()`` returns the API key to use.
        dry_run: when True, affected points are only counted. No API call is
            made because the classification result would not be used.

    Returns:
        ``defaultdict(int)`` with the ``classified`` counter.
    """
    log.info("=== PASS 2: Re-enriching empty domain concepts ===")
    stats = defaultdict(int)

    # Find empty domain points in Qdrant
    offset = None
    empty_points = []
    while True:
        results, offset = qdrant.scroll(
            collection_name=collection,
            limit=500,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        empty_points.extend(
            r for r in results if _domain_is_empty(r.payload.get("domain", []))
        )
        if offset is None:
            break

    log.info(f"Found {len(empty_points)} points with empty domains")

    for point in empty_points:
        stats["classified"] += 1
        if dry_run:
            continue

        payload = point.payload
        title = payload.get("title", "")
        subdomains = payload.get("subdomain", [])
        content = payload.get("content", payload.get("summary", ""))

        new_domains = classify_concept(title, subdomains, content, key_rotator.next())

        qdrant.set_payload(
            collection_name=collection,
            payload={"domain": new_domains},
            points=[point.id],
        )

        # Also update the concept JSON on disk
        doc_hash = payload.get("doc_hash", "")
        if doc_hash:
            _set_domains_in_doc_json(doc_hash, title, new_domains)

        time.sleep(0.05)

    verb = "would be re-classified" if dry_run else "re-classified"
    log.info(f"Pass 2 complete: {stats['classified']} concepts {verb}")
    return stats


def _domain_is_empty(value):
    """Return True for a missing, empty or ``[""]`` domain value."""
    return not value or value == [""]


def _set_domains_in_doc_json(doc_hash, title, new_domains):
    """Write *new_domains* to empty-domain concepts titled *title* of one document."""
    doc_concepts_dir = CONCEPTS_DIR / doc_hash
    if not doc_concepts_dir.exists():
        return
    for wf in doc_concepts_dir.glob("window_*.json"):
        try:
            with open(wf, "r", encoding="utf-8") as f:
                concepts = json.load(f)
            changed = False
            for c in concepts:
                if (isinstance(c, dict) and c.get("title") == title
                        and _domain_is_empty(c.get("domain", []))):
                    c["domain"] = new_domains
                    changed = True
            if changed:
                with open(wf, "w", encoding="utf-8") as f:
                    json.dump(concepts, f, indent=2, ensure_ascii=False)
        except Exception as exc:
            # Best effort per file, but do not hide the failure.
            log.warning("Could not update %s: %s", wf, exc)
|
||||
|
||||
# ── PASS 3: Purge junk URLs ──────────────────────────────────────────────────
|
||||
|
||||
def is_junk_url(url):
    """Return True when *url* contains any ``JUNK_URL_PATTERNS`` entry.

    Matching is a case-insensitive substring test. Empty or non-string values
    (for example a null payload field) are never treated as junk.
    """
    if not url or not isinstance(url, str):
        return False
    url_lower = url.lower()
    return any(pattern.lower() in url_lower for pattern in JUNK_URL_PATTERNS)
|
||||
|
||||
def pass3_purge_junk(qdrant, collection, dry_run):
    """Pass 3: delete vectors of junk web pages and flag their documents.

    Points with ``source_type`` "web" are scrolled and tested with
    ``is_junk_url`` on their ``filename`` payload. Matching points are deleted
    from Qdrant in batches of 500, and every document hash seen on a junk
    point is set to status ``skipped`` in the ``documents`` table of the
    SQLite DB at ``DB_PATH``.

    Args:
        qdrant: Qdrant client used for ``scroll`` and ``delete``.
        collection: name of the Qdrant collection.
        dry_run: when True, junk is only counted; nothing is deleted or
            updated.

    Returns:
        ``defaultdict(int)`` with ``junk_vectors`` and ``junk_docs`` counters.
    """
    from contextlib import closing

    log.info("=== PASS 3: Purging junk URLs ===")
    stats = defaultdict(int)

    # Scroll all web-source points and find junk
    web_filter = Filter(
        must=[FieldCondition(key="source_type", match=MatchAny(any=["web"]))]
    )
    offset = None
    junk_point_ids = []
    junk_doc_hashes = set()

    while True:
        results, offset = qdrant.scroll(
            collection_name=collection,
            scroll_filter=web_filter,
            limit=500,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        for r in results:
            if not is_junk_url(r.payload.get("filename", "")):
                continue
            junk_point_ids.append(r.id)
            doc_hash = r.payload.get("doc_hash", "")
            if doc_hash:
                junk_doc_hashes.add(doc_hash)
        if offset is None:
            break

    log.info(f"Found {len(junk_point_ids)} junk vectors across {len(junk_doc_hashes)} documents")

    if not dry_run and junk_point_ids:
        # Delete in batches
        batch_size = 500
        for i in range(0, len(junk_point_ids), batch_size):
            batch = junk_point_ids[i:i + batch_size]
            qdrant.delete(collection_name=collection, points_selector=batch)
        log.info(f"Deleted {len(junk_point_ids)} junk vectors from Qdrant")

        # Mark junk docs as skipped in SQLite. closing() releases the
        # connection even when an UPDATE fails; the inner `with conn` commits
        # on success and rolls back on error.
        with closing(sqlite3.connect(str(DB_PATH))) as conn:
            with conn:
                conn.executemany(
                    "UPDATE documents SET status = 'skipped', error_message = 'junk content purged' WHERE hash = ?",
                    [(doc_hash,) for doc_hash in junk_doc_hashes],
                )
        log.info(f"Marked {len(junk_doc_hashes)} documents as skipped in DB")

    stats["junk_vectors"] = len(junk_point_ids)
    stats["junk_docs"] = len(junk_doc_hashes)
    log.info(f"Pass 3 complete: {stats['junk_vectors']} vectors, {stats['junk_docs']} docs purged")
    return stats
|
||||
|
||||
|
||||
def main():
    """Parse the command line and run the cleanup passes that were not skipped.

    ``--dry-run`` is forwarded to every pass; ``--skip-pass N`` may be given
    several times. Gemini keys are only loaded when pass 2 will run, and the
    run is refused before any pass starts when pass 2 is requested but no key
    is available.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--skip-pass", type=int, action="append", default=[])
    args = parser.parse_args()

    config = get_config()

    # Only pass 2 talks to Gemini, so passes 1 and 3 must not depend on the
    # .env file being present or containing keys.
    rotator = None
    if 2 not in args.skip_pass:
        keys = load_gemini_keys()
        if not keys:
            log.error("No Gemini keys found in .env (required for pass 2; use --skip-pass 2 to run without)")
            return
        rotator = KeyRotator(keys)

    vector_db = config['vector_db']
    qdrant = QdrantClient(
        host=vector_db['host'],
        port=vector_db['port'],
        timeout=60
    )
    collection = vector_db['collection']

    log.info(f"Starting cleanup | dry_run={args.dry_run} | skipping passes: {args.skip_pass}")

    if 1 not in args.skip_pass:
        pass1_remap_outliers(qdrant, collection, args.dry_run)

    if 2 not in args.skip_pass:
        pass2_empty_domains(qdrant, collection, rotator, args.dry_run)

    if 3 not in args.skip_pass:
        pass3_purge_junk(qdrant, collection, args.dry_run)

    log.info("All passes complete.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
215
scripts/domain_reenrich.py
Executable file
215
scripts/domain_reenrich.py
Executable file
|
|
@ -0,0 +1,215 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
domain_reenrich.py — Re-enriches solo-Reference concepts that domain_remap.py
|
||||
couldn't fix via subdomain lookup. Reads remap_unknowns.jsonl, calls Gemini
|
||||
with a lightweight classification-only prompt, updates domain in-place.
|
||||
|
||||
Usage:
|
||||
python3 /opt/recon/scripts/domain_reenrich.py [--workers 16] [--limit N]
|
||||
|
||||
Reads: /opt/recon/data/remap_unknowns.jsonl
|
||||
Writes: domain field in-place in window JSON files
|
||||
Log: /opt/recon/logs/domain_reenrich.log
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
import argparse
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from collections import defaultdict
|
||||
|
||||
import google.generativeai as genai
|
||||
|
||||
UNKNOWNS_FILE = Path("/opt/recon/data/remap_unknowns.jsonl")
|
||||
LOG_FILE = Path("/opt/recon/logs/domain_reenrich.log")
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler(LOG_FILE),
|
||||
logging.StreamHandler(),
|
||||
]
|
||||
)
|
||||
log = logging.getLogger("domain_reenrich")
|
||||
|
||||
CANONICAL_DOMAINS = [
|
||||
"Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
|
||||
"Foundational Skills", "Communications", "Medical", "Food Systems",
|
||||
"Navigation", "Logistics", "Power Systems", "Leadership",
|
||||
"Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
|
||||
]
|
||||
|
||||
DOMAIN_SET = set(CANONICAL_DOMAINS)
|
||||
|
||||
CLASSIFY_PROMPT = """\
|
||||
Classify this knowledge concept into one or more domains.
|
||||
|
||||
VALID DOMAINS (use ONLY these exact strings, no others):
|
||||
{domains}
|
||||
|
||||
Concept title: {title}
|
||||
Concept tags: {subdomain}
|
||||
Concept preview: {content}
|
||||
|
||||
Return ONLY valid JSON, no markdown, no explanation:
|
||||
{{"domain": ["Domain Name"]}}
|
||||
|
||||
Rules:
|
||||
- Use only the domain strings listed above, spelled exactly
|
||||
- If genuinely multi-domain assign all that apply
|
||||
- Never return empty domain list — pick the closest match
|
||||
- Medical content, herbs, first aid, veterinary → Medical
|
||||
- Food growing, foraging, hunting, livestock → Sustainment Systems
|
||||
- Food preservation, canning, storage → Food Systems
|
||||
- Solar, wind, batteries, generators → Power Systems
|
||||
- Water sourcing, filtration, sanitation → Water Systems
|
||||
"""
|
||||
|
||||
def load_gemini_keys(env_path="/opt/recon/.env"):
    """Collect Gemini API keys from the ``GEMINI_KEY_*`` lines of a dotenv file.

    Args:
        env_path: path of the dotenv file to read. Defaults to
            ``/opt/recon/.env``.

    Returns:
        List of key strings in file order. Surrounding whitespace and one
        matching pair of quotes are removed from each value; lines without
        ``=`` or with an empty value are skipped.

    Raises:
        OSError: if the file cannot be read.
    """
    env = Path(env_path)
    keys = []
    for raw_line in env.read_text().splitlines():
        line = raw_line.strip()
        if not line.startswith("GEMINI_KEY_"):
            continue
        # partition() never raises, unlike split("=", 1)[1] on a line
        # that has no "=" at all.
        _, sep, value = line.partition("=")
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        if sep and value:
            keys.append(value)
    return keys
|
||||
|
||||
class KeyRotator:
    """Hands out API keys from a list in cyclic order under a lock."""

    def __init__(self, keys):
        self.keys = keys
        self._i = 0  # running count of keys handed out
        self._lock = threading.Lock()

    def next(self):
        """Return the key at the current position and advance the rotation."""
        with self._lock:
            position = self._i % len(self.keys)
            picked = self.keys[position]
            self._i = self._i + 1
        return picked
|
||||
|
||||
def classify_concept(title, subdomains, content, key):
    """Ask Gemini to assign canonical domains to a single concept.

    The prompt is built from ``CLASSIFY_PROMPT`` using the list of canonical
    domains, the title, up to ten subdomain tags and the first 300 characters
    of the content. Up to four attempts are made; errors whose text looks like
    rate limiting or unavailability are retried with exponential back-off, any
    other error stops the retries.

    Args:
        title: concept title; empty values are sent as "(untitled)".
        subdomains: list of tag strings (a single string is treated as one tag).
        content: concept preview text; converted with ``str()``.
        key: Gemini API key to configure the client with.

    Returns:
        Non-empty list of names from ``DOMAIN_SET``. When no valid domain
        could be obtained, ``["Foundational Skills"]`` is returned and a
        warning is logged.
    """
    if isinstance(subdomains, str):
        # A bare string would otherwise be sliced and joined per character.
        subdomains = [subdomains]
    prompt = CLASSIFY_PROMPT.format(
        domains="\n".join(f" {d}" for d in CANONICAL_DOMAINS),
        title=title or "(untitled)",
        subdomain=", ".join(str(s) for s in subdomains[:10]) if subdomains else "(none)",
        content=str(content)[:300] if content else "(none)",
    )
    # NOTE(review): configure() is called per request with a rotating key while
    # this function runs in several worker threads; if it sets process-wide
    # state the key actually used may differ from `key` — confirm.
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"}
    )
    max_attempts = 4
    for attempt in range(max_attempts):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
            raw = data.get("domain", [])
            if isinstance(raw, str):
                # Tolerate {"domain": "Medical"} instead of a list.
                raw = [raw]
            domains = [d for d in raw if d in DOMAIN_SET]
            if domains:
                return domains
        except Exception as e:
            err = str(e).lower()
            if not any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
                log.warning("Gemini classification failed for %r: %s", title, e)
                break
            # Back off before the next attempt; sleeping after the final
            # attempt would only delay the fallback.
            if attempt < max_attempts - 1:
                delay = min(5 * (2 ** attempt) + random.uniform(0, 3), 60)
                time.sleep(delay)
    log.warning("No valid domain obtained for %r; falling back to Foundational Skills", title)
    return ["Foundational Skills"]  # last-resort fallback
|
||||
|
||||
# One lock per window file: several queued items can point at the same file
# and workers run concurrently, so every read-modify-write must be serialised.
_FILE_LOCKS = {}
_FILE_LOCKS_GUARD = threading.Lock()


def _file_lock(filepath):
    """Return the lock guarding *filepath*, creating it on first use."""
    with _FILE_LOCKS_GUARD:
        return _FILE_LOCKS.setdefault(str(filepath), threading.Lock())


def _load_concepts(filepath):
    """Read a window file; return ``(concepts, None)`` or ``(None, status)``."""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            concepts = json.load(f)
    except Exception:
        return None, "read_error"
    if not isinstance(concepts, list):
        return None, "not_list"
    return concepts, None


def _find_pending(concepts, title):
    """Return the first concept titled *title* if it still needs a domain."""
    for concept in concepts:
        if not isinstance(concept, dict):
            continue
        if concept.get("title", "") == title:
            raw = concept.get("domain", [])
            if isinstance(raw, str):
                raw = [raw]
            # Only re-enrich if still stuck on Reference
            if raw == ["Reference"] or raw == []:
                return concept
            return None
    return None


def process_unknown(item, key_rotator):
    """Re-classify one unresolved concept and store the result in its file.

    The concept is located by title in the window file named by
    ``item["filepath"]``. It is only changed while its domain is still
    ``["Reference"]`` or empty; in that case ``classify_concept`` supplies the
    new domains and the concept is additionally marked ``_reenriched``.

    Args:
        item: dict with ``filepath`` and optionally ``title``, ``subdomain``
            and ``content_preview``.
        key_rotator: object whose ``next()`` returns the API key to use.

    Returns:
        One of ``"ok"``, ``"file_missing"``, ``"read_error"``, ``"not_list"``,
        ``"already_fixed"`` or ``"write_error"``.
    """
    filepath = Path(item["filepath"])
    title = item.get("title", "")
    subdomains = item.get("subdomain", [])
    content = item.get("content_preview", "")

    if not filepath.exists():
        return "file_missing"

    lock = _file_lock(filepath)

    # Cheap pre-check so that no API call is spent on a concept that is
    # missing or already fixed.
    with lock:
        concepts, status = _load_concepts(filepath)
    if status:
        return status
    if _find_pending(concepts, title) is None:
        return "already_fixed"

    # The slow API call runs outside the lock so items sharing a file do not
    # serialise on it.
    new_domains = classify_concept(title, subdomains, content, key_rotator.next())

    with lock:
        # Re-read: another worker may have rewritten the file in the meantime.
        concepts, status = _load_concepts(filepath)
        if status:
            return status
        concept = _find_pending(concepts, title)
        if concept is None:
            return "already_fixed"
        concept["domain"] = new_domains
        concept["_reenriched"] = True
        try:
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(concepts, f, indent=2, ensure_ascii=False)
        except Exception:
            return "write_error"

    return "ok"
|
||||
|
||||
def main():
    """Re-enrich every concept listed in ``UNKNOWNS_FILE`` using a thread pool.

    Each non-empty line of the JSONL file is one work item for
    ``process_unknown``. ``--workers`` sets the pool size and ``--limit``
    truncates the item list. Per-status counts are logged every 5000 completed
    items and once more at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--workers", type=int, default=16)
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()

    keys = load_gemini_keys()
    if not keys:
        log.error("No Gemini keys found in .env")
        return
    rotator = KeyRotator(keys)

    unknowns = []
    try:
        with open(UNKNOWNS_FILE, "r", encoding="utf-8") as f:
            for lineno, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    unknowns.append(json.loads(line))
                except json.JSONDecodeError as exc:
                    # One damaged line must not discard the whole work list.
                    log.warning("Skipping malformed line %d of %s: %s", lineno, UNKNOWNS_FILE, exc)
    except OSError as exc:
        log.error("Cannot read %s: %s", UNKNOWNS_FILE, exc)
        return

    if args.limit:
        unknowns = unknowns[:args.limit]

    total = len(unknowns)
    log.info(f"Re-enriching {total:,} concepts | {args.workers} workers | {len(keys)} API keys")
    log.info(f"Estimated Gemini Flash cost: ~${total * 0.0004:.2f} (conservative)")

    # Results are only tallied in this thread, so no lock is needed here.
    results = defaultdict(int)
    done = 0

    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futures = {ex.submit(process_unknown, item, rotator): item for item in unknowns}
        for future in as_completed(futures):
            try:
                status = future.result()
            except Exception:
                # A failing item is counted instead of aborting the run.
                item = futures[future]
                where = item.get("filepath") if isinstance(item, dict) else item
                log.exception("Worker failed for %r", where)
                status = "exception"
            results[status] += 1
            done += 1
            if done % 5000 == 0:
                pct = done / total * 100
                log.info(f" Progress: {done:,}/{total:,} ({pct:.1f}%) | {dict(results)}")

    log.info("── Final Results ─────────────────────────────────────────────")
    for status, count in sorted(results.items(), key=lambda x: -x[1]):
        log.info(f" {status:<25} {count:>10,}")
    log.info(f" Total: {total:,}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
428
scripts/domain_remap.py
Executable file
428
scripts/domain_remap.py
Executable file
|
|
@ -0,0 +1,428 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
domain_remap.py — Fix RECON concept domain classifications without API calls.
|
||||
|
||||
What this does:
|
||||
1. Strips "Reference" from concepts that have other real domains
|
||||
2. Remaps variant domain spellings to canonical names
|
||||
3. Reclassifies solo-Reference concepts using their subdomain tags
|
||||
4. Writes a JSONL file of true unknowns for API re-enrichment
|
||||
|
||||
Each window file is a JSON array of concept dicts.
|
||||
Field names: "domain" (list), "subdomain" (list)
|
||||
|
||||
Usage:
|
||||
python3 /opt/recon/scripts/domain_remap.py --dry-run # report only
|
||||
python3 /opt/recon/scripts/domain_remap.py # apply fixes
|
||||
python3 /opt/recon/scripts/domain_remap.py --workers 16
|
||||
"""
|
||||
|
||||
import json
|
||||
import argparse
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from collections import defaultdict
|
||||
|
||||
CONCEPTS_DIR = Path("/opt/recon/data/concepts")
|
||||
UNKNOWNS_OUTPUT = Path("/opt/recon/data/remap_unknowns.jsonl")
|
||||
|
||||
CANONICAL_DOMAINS = {
|
||||
"Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
|
||||
"Foundational Skills", "Communications", "Medical", "Food Systems",
|
||||
"Navigation", "Logistics", "Power Systems", "Leadership",
|
||||
"Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
|
||||
}
|
||||
|
||||
# Variant → Canonical mapping
|
||||
VARIANT_MAP = {
|
||||
# Defense & Tactics
|
||||
"Tactical Ops": "Defense & Tactics",
|
||||
"Tactical_Ops": "Defense & Tactics",
|
||||
"Tactical Operations": "Defense & Tactics",
|
||||
"Tactical": "Defense & Tactics",
|
||||
"Tactical Skills": "Defense & Tactics",
|
||||
"Tactics": "Defense & Tactics",
|
||||
"Tactics & Defense": "Defense & Tactics",
|
||||
"Reconnaissance": "Defense & Tactics",
|
||||
"Fire Support": "Defense & Tactics",
|
||||
"Improvised Munitions": "Defense & Tactics",
|
||||
"Military Intelligence": "Defense & Tactics",
|
||||
"Military History": "Defense & Tactics",
|
||||
"Military Engineering": "Defense & Tactics",
|
||||
# Medical
|
||||
"Medical Care": "Medical",
|
||||
"Medical Alternatives": "Medical",
|
||||
"Medical/Dental": "Medical",
|
||||
"Medical & Dental": "Medical",
|
||||
"medical": "Medical",
|
||||
"Medical Awareness": "Medical",
|
||||
"Medical Disasters": "Medical",
|
||||
"Medical Emergency Survival": "Medical",
|
||||
"Medical Procedures": "Medical",
|
||||
"Medical Treatment": "Medical",
|
||||
"Medical Science": "Medical",
|
||||
"Medical History": "Medical",
|
||||
"Medical Diagnosis": "Medical",
|
||||
"Medical Skills": "Medical",
|
||||
"Medical Supply": "Medical",
|
||||
"Medical Gear": "Medical",
|
||||
"Medical Kits": "Medical",
|
||||
"Medical Logistics": "Logistics",
|
||||
"Medical First Aid": "Medical",
|
||||
"Medical Ethics": "Medical",
|
||||
"Medical Reference Ranges": "Medical",
|
||||
"Medical andSurgical Hints": "Medical",
|
||||
"Medical Aspects of Radiation Injury": "Medical",
|
||||
"Medical Uses": "Medical",
|
||||
"Medical Care in Developing Countries": "Medical",
|
||||
"Survival Medicine": "Medical",
|
||||
"Emergency War Surgery": "Medical",
|
||||
"First Aid": "Medical",
|
||||
"First Aid and Life Saving": "Medical",
|
||||
"Veterinary Medicine": "Medical",
|
||||
"Veterinary Hygiene": "Medical",
|
||||
"Veterinary": "Medical",
|
||||
"Pharmacology": "Medical",
|
||||
"Public Health": "Medical",
|
||||
"Health": "Medical",
|
||||
# Food Systems
|
||||
"Food_Systems": "Food Systems",
|
||||
"Food_systems": "Food Systems",
|
||||
"food_systems": "Food Systems",
|
||||
"Food Preservation": "Food Systems",
|
||||
"Food Safety": "Food Systems",
|
||||
"Food Security": "Food Systems",
|
||||
"Food & Nutrition": "Food Systems",
|
||||
"Diet & Nutrition": "Food Systems",
|
||||
"Culinary Arts": "Food Systems",
|
||||
"Foodprocessing": "Food Systems",
|
||||
"Food": "Food Systems",
|
||||
# Sustainment Systems
|
||||
"Sustainment_Systems": "Sustainment Systems",
|
||||
"Agriculture": "Sustainment Systems",
|
||||
"Agriculture & Natural Resources": "Sustainment Systems",
|
||||
"Agriculture and Natural Resources": "Sustainment Systems",
|
||||
"Horticulture": "Sustainment Systems",
|
||||
"Gardening": "Sustainment Systems",
|
||||
"Hydroponics": "Sustainment Systems",
|
||||
"Survival Skills": "Sustainment Systems",
|
||||
# Foundational Skills
|
||||
"Foundational_Skills": "Foundational Skills",
|
||||
"Primitive Living Skills": "Foundational Skills",
|
||||
"Woodcraft": "Foundational Skills",
|
||||
"Home Workshop": "Foundational Skills",
|
||||
"Science": "Foundational Skills",
|
||||
"Engineering": "Foundational Skills",
|
||||
"Construction": "Foundational Skills",
|
||||
"Industrial Processes": "Foundational Skills",
|
||||
"Machine Technology": "Foundational Skills",
|
||||
"Training": "Foundational Skills",
|
||||
"Education": "Foundational Skills",
|
||||
# Off-Grid Systems
|
||||
"Off-Grid_Systems": "Off-Grid Systems",
|
||||
"Appropriate Technology": "Off-Grid Systems",
|
||||
# Power Systems
|
||||
"Homebrewed Electricity": "Power Systems",
|
||||
"Renewable Energy": "Power Systems",
|
||||
"Renewable Energy FAQs": "Power Systems",
|
||||
"Alternative Fuels": "Power Systems",
|
||||
"Power_Systems": "Power Systems",
|
||||
# Water Systems
|
||||
"Water_Systems": "Water Systems",
|
||||
# Community Coordination
|
||||
"Community_Coordination": "Community Coordination",
|
||||
"Community_coordination": "Community Coordination",
|
||||
"Community": "Community Coordination",
|
||||
# Leadership
|
||||
"Leadership & Planning": "Leadership",
|
||||
"Planning": "Leadership",
|
||||
"Administration": "Leadership",
|
||||
"Governance": "Leadership",
|
||||
"Government": "Leadership",
|
||||
# Communications
|
||||
"Emergency Communications": "Communications",
|
||||
# Security
|
||||
"Security Systems": "Security",
|
||||
# Logistics
|
||||
"Transportation": "Logistics",
|
||||
# Scenario Playbooks
|
||||
"General Preparedness": "Scenario Playbooks",
|
||||
"Emergency Preparedness": "Scenario Playbooks",
|
||||
"Emergency Management": "Scenario Playbooks",
|
||||
"Wilderness Preparedness": "Scenario Playbooks",
|
||||
"Urban Preparedness": "Scenario Playbooks",
|
||||
"Winter Preparedness": "Scenario Playbooks",
|
||||
# Discard (noise domains)
|
||||
"Humor": None,
|
||||
"Recreation": None,
|
||||
"Business": None,
|
||||
"Finance": None,
|
||||
"Economics": None,
|
||||
"Economics/Finances": None,
|
||||
"Weird Science": None,
|
||||
}
|
||||
|
||||
# Subdomain keyword → canonical domain (for solo-Reference reclassification)
|
||||
SUBDOMAIN_MAP = {
|
||||
"first aid": "Medical",
|
||||
"emergency care": "Medical",
|
||||
"emergency medicine": "Medical",
|
||||
"trauma": "Medical",
|
||||
"anatomy": "Medical",
|
||||
"oral rehydration": "Medical",
|
||||
"ors": "Medical",
|
||||
"pharmacology": "Medical",
|
||||
"toxicology": "Medical",
|
||||
"antidote": "Medical",
|
||||
"nerve agent": "Defense & Tactics",
|
||||
"chemical warfare": "Defense & Tactics",
|
||||
"biological warfare": "Defense & Tactics",
|
||||
"nbc": "Defense & Tactics",
|
||||
"infectious disease": "Medical",
|
||||
"microbiology": "Medical",
|
||||
"virology": "Medical",
|
||||
"bacteriology": "Medical",
|
||||
"pediatric": "Medical",
|
||||
"surgery": "Medical",
|
||||
"wound care": "Medical",
|
||||
"veterinary": "Medical",
|
||||
"dental": "Medical",
|
||||
"dentistry": "Medical",
|
||||
"herbal": "Medical",
|
||||
"medicinal plant": "Medical",
|
||||
"medicinal herb": "Medical",
|
||||
"herbalism": "Medical",
|
||||
"food preservation": "Food Systems",
|
||||
"canning": "Food Systems",
|
||||
"fermentation": "Food Systems",
|
||||
"food storage": "Food Systems",
|
||||
"food safety": "Food Systems",
|
||||
"cooking": "Food Systems",
|
||||
"food processing": "Food Systems",
|
||||
"agriculture": "Sustainment Systems",
|
||||
"soil": "Sustainment Systems",
|
||||
"permaculture": "Sustainment Systems",
|
||||
"agroforestry": "Sustainment Systems",
|
||||
"livestock": "Sustainment Systems",
|
||||
"animal husbandry": "Sustainment Systems",
|
||||
"beekeeping": "Sustainment Systems",
|
||||
"foraging": "Sustainment Systems",
|
||||
"hunting": "Sustainment Systems",
|
||||
"fishing": "Sustainment Systems",
|
||||
"gardening": "Sustainment Systems",
|
||||
"mycology": "Sustainment Systems",
|
||||
"mushroom": "Sustainment Systems",
|
||||
"water purification": "Water Systems",
|
||||
"water filtration": "Water Systems",
|
||||
"water sanitation": "Water Systems",
|
||||
"water disinfection": "Water Systems",
|
||||
"water storage": "Water Systems",
|
||||
"well construction": "Water Systems",
|
||||
"rainwater": "Water Systems",
|
||||
"solar": "Power Systems",
|
||||
"wind turbine": "Power Systems",
|
||||
"battery": "Power Systems",
|
||||
"batteries": "Power Systems",
|
||||
"generator": "Power Systems",
|
||||
"photovoltaic": "Power Systems",
|
||||
"charge controller": "Power Systems",
|
||||
"inverter": "Power Systems",
|
||||
"biogas": "Off-Grid Systems",
|
||||
"biomass": "Off-Grid Systems",
|
||||
"wood gasification": "Off-Grid Systems",
|
||||
"rocket stove": "Off-Grid Systems",
|
||||
"mechanical system": "Off-Grid Systems",
|
||||
"power transmission": "Off-Grid Systems",
|
||||
"radio": "Communications",
|
||||
"ham radio": "Communications",
|
||||
"amateur radio": "Communications",
|
||||
"antenna": "Communications",
|
||||
"meshtastic": "Communications",
|
||||
"encryption": "Communications",
|
||||
"navigation": "Navigation",
|
||||
"celestial navigation": "Navigation",
|
||||
"land navigation": "Navigation",
|
||||
"map reading": "Navigation",
|
||||
"compass": "Navigation",
|
||||
"pottery": "Foundational Skills",
|
||||
"ceramics": "Foundational Skills",
|
||||
"blacksmithing": "Foundational Skills",
|
||||
"woodworking": "Foundational Skills",
|
||||
"leatherwork": "Foundational Skills",
|
||||
"textile": "Foundational Skills",
|
||||
"masonry": "Foundational Skills",
|
||||
"metalworking": "Foundational Skills",
|
||||
"historical technology": "Foundational Skills",
|
||||
"weapons": "Defense & Tactics",
|
||||
"firearms": "Defense & Tactics",
|
||||
"ballistics": "Defense & Tactics",
|
||||
"tactics": "Defense & Tactics",
|
||||
"perimeter": "Security",
|
||||
"surveillance": "Security",
|
||||
"supply chain": "Logistics",
|
||||
"logistics": "Logistics",
|
||||
"leadership": "Leadership",
|
||||
"governance": "Leadership",
|
||||
"community": "Community Coordination",
|
||||
"emergency preparedness": "Scenario Playbooks",
|
||||
"disaster": "Scenario Playbooks",
|
||||
"evacuation": "Scenario Playbooks",
|
||||
}
|
||||
|
||||
|
||||
def remap_domains(domains):
    """Map raw domain labels onto canonical domain names.

    Rules applied to each entry of *domains*:
      * "Reference" is always removed.
      * Canonical names are kept as-is.
      * Known variants are translated through VARIANT_MAP; a variant mapped
        to None is a noise domain and is discarded.
      * Anything else (unknown labels, non-string junk) is dropped.

    Returns a sorted, de-duplicated list so that the JSON written back to
    disk is identical from run to run (plain set iteration order depends on
    string hash randomisation).
    """
    result = set()
    for d in domains:
        # Guard first: unhashable junk such as a nested list would otherwise
        # raise TypeError in the set/dict membership tests below.
        if not isinstance(d, str) or d == "Reference":
            continue
        if d in CANONICAL_DOMAINS:
            result.add(d)
        elif d in VARIANT_MAP:
            mapped = VARIANT_MAP[d]
            if mapped:  # None means discard
                result.add(mapped)
        # Unknown non-canonical domains: drop them
    return sorted(result)
|
||||
|
||||
|
||||
def classify_by_subdomain(subdomains, keyword_map=None):
    """Infer canonical domain(s) from subdomain tags by keyword matching.

    Args:
        subdomains: iterable of subdomain tag strings (non-strings are
            ignored; None is treated as empty).
        keyword_map: optional ``{keyword: canonical domain}`` mapping.
            Defaults to the module-level SUBDOMAIN_MAP.

    Returns:
        A sorted list of the distinct domains whose keyword occurs in at
        least one tag, or None when nothing matches.

    Keywords match on word boundaries (with an optional plural "s"/"es"),
    not as raw substrings: a plain ``key in tag`` test classifies
    "outdoors" and "sensors" as Medical via the "ors" keyword and
    "radioactive" as Communications via "radio". Underscores, hyphens and
    slashes in a tag are treated as spaces so "first_aid" still matches
    "first aid".
    """
    import re  # local import: the module does not import re at file level

    if keyword_map is None:
        keyword_map = SUBDOMAIN_MAP

    found = set()
    for sd in subdomains or ():
        if not isinstance(sd, str):
            continue
        text = re.sub(r"[_\-/]+", " ", sd.lower()).strip()
        if not text:
            continue
        for key, domain in keyword_map.items():
            # re caches compiled patterns, so rebuilding the pattern string
            # per tag does not recompile it.
            if re.search(rf"\b{re.escape(key)}(?:s|es)?\b", text):
                found.add(domain)
    # Sorted so the value written to disk is stable across runs.
    return sorted(found) if found else None
|
||||
|
||||
|
||||
def _as_label_list(value):
    """Coerce a raw "domain"/"subdomain" field into a list of strings."""
    if isinstance(value, str):
        return [value]
    if isinstance(value, (list, tuple)):
        return [item for item in value if isinstance(item, str)]
    # None (JSON null), numbers, dicts, ...: treat as "no labels".
    return []


def process_window_file(filepath, dry_run):
    """Fix domain labels in one window JSON file (an array of concept dicts).

    Per concept:
      * If canonical domains remain after remapping, they replace the
        "domain" field ("reference_stripped" when "Reference" was present,
        "variant_remapped" when only spellings changed, otherwise
        "no_change").
      * If nothing canonical remains (solo "Reference", or only
        unknown/noise labels), the domain is inferred from the subdomain
        tags; failing that the concept is reported as a true unknown for
        API re-enrichment and its "domain" field is left untouched.

    Args:
        filepath: path of the window file.
        dry_run: when true, compute stats/unknowns but never write.

    Returns:
        ``(stats, unknowns)`` where *stats* is a dict of counters and
        *unknowns* a list of dicts describing concepts needing enrichment.
        Unreadable/invalid files yield ``({"parse_error": 1}, [])`` and
        non-array files ``({"skip_not_list": 1}, [])``.
    """
    stats = defaultdict(int)
    unknowns = []

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            concepts = json.load(f)
    except (OSError, ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {"parse_error": 1}, []

    if not isinstance(concepts, list):
        return {"skip_not_list": 1}, []

    modified = False

    for concept in concepts:
        if not isinstance(concept, dict):
            continue

        # A null or oddly-typed field must not crash the worker.
        raw_domains = _as_label_list(concept.get("domain"))
        subdomains = _as_label_list(concept.get("subdomain"))
        has_reference = "Reference" in raw_domains

        # remap_domains() already drops "Reference" itself.
        remapped = remap_domains(raw_domains)

        if remapped:
            if has_reference:
                # Reference + real domains: drop Reference, keep the rest
                concept["domain"] = remapped
                modified = True
                stats["reference_stripped"] += 1
            elif set(remapped) != set(raw_domains):
                concept["domain"] = remapped
                modified = True
                stats["variant_remapped"] += 1
            else:
                stats["no_change"] += 1
            continue

        if not raw_domains:
            # Concept never had a domain; nothing to remap.
            stats["no_change"] += 1
            continue

        # Nothing canonical survived (solo Reference, or only unknown/noise
        # labels). Do not wipe the field to []: try the subdomain lookup,
        # otherwise hand the concept to re-enrichment.
        inferred = classify_by_subdomain(subdomains)
        if inferred:
            concept["domain"] = inferred
            if has_reference:
                concept["_reclassified_from_reference"] = True
            modified = True
            stats["subdomain_reclassified"] += 1
            continue

        # True unknown — needs API re-enrichment
        unknowns.append({
            "filepath": str(filepath),
            "title": concept.get("title", ""),
            "subdomain": subdomains,
            "content_preview": str(concept.get("content", concept.get("summary", "")))[:300],
        })
        stats["needs_enrichment"] += 1

    if modified and not dry_run:
        # Write to a sibling temp file and swap it in, so an interrupted run
        # cannot leave a truncated window file behind.
        target = Path(filepath)
        tmp = target.with_name(target.name + ".tmp")
        try:
            with open(tmp, "w", encoding="utf-8") as f:
                json.dump(concepts, f, indent=2, ensure_ascii=False)
            tmp.replace(target)
        finally:
            if tmp.exists():  # only true when the swap did not happen
                tmp.unlink()

    return dict(stats), unknowns
|
||||
|
||||
|
||||
def main():
    """CLI entry point: remap domains in every window file under CONCEPTS_DIR.

    Fans the files out to a thread pool, aggregates the per-file counters,
    prints a summary, and (unless --dry-run) writes the true unknowns to
    UNKNOWNS_OUTPUT as JSONL.
    """
    parser = argparse.ArgumentParser(description="Remap RECON concept domains")
    parser.add_argument("--dry-run", action="store_true", help="Report without writing")
    parser.add_argument("--workers", type=int, default=16)
    args = parser.parse_args()

    print(f"[REMAP] Scanning {CONCEPTS_DIR}")
    print(f"[REMAP] Dry run: {args.dry_run} | Workers: {args.workers}")

    window_files = list(CONCEPTS_DIR.rglob("window_*.json"))
    print(f"[REMAP] Found {len(window_files):,} window files")

    # Counters that count files, not concepts; excluded from the concept total.
    file_level_keys = {"parse_error", "skip_not_list", "worker_error"}

    total_stats = defaultdict(int)
    all_unknowns = []
    done = 0

    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futures = {ex.submit(process_window_file, f, args.dry_run): f for f in window_files}
        # Results are consumed on this thread only, so no lock is needed.
        for future in as_completed(futures):
            try:
                file_stats, unknowns = future.result()
            except Exception as exc:
                # One bad file must not abort the whole run.
                print(f" [REMAP] ERROR processing {futures[future]}: {exc}")
                file_stats, unknowns = {"worker_error": 1}, []
            for k, v in file_stats.items():
                total_stats[k] += v
            all_unknowns.extend(unknowns)
            done += 1
            if done % 5000 == 0:
                print(f" {done:,}/{len(window_files):,} files processed...")

    print("\n── Results ─────────────────────────────────────────────────")
    for status, count in sorted(total_stats.items(), key=lambda x: -x[1]):
        print(f" {status:<35} {count:>10,}")

    total_concepts = sum(v for k, v in total_stats.items() if k not in file_level_keys)
    print(f"\n Total concepts processed: {total_concepts:>10,}")
    print(f" True unknowns for re-enrichment:{len(all_unknowns):>10,}")

    if not args.dry_run and all_unknowns:
        with open(UNKNOWNS_OUTPUT, "w", encoding="utf-8") as f:
            for item in all_unknowns:
                f.write(json.dumps(item) + "\n")
        print(f"\n Unknowns written to: {UNKNOWNS_OUTPUT}")

    if args.dry_run:
        print("\n [DRY RUN] No files were modified.")


if __name__ == "__main__":
    main()
|
||||
469
scripts/migrate_domains.py
Normal file
469
scripts/migrate_domains.py
Normal file
|
|
@ -0,0 +1,469 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
migrate_domains.py — Reclassify 5 legacy domains via Gemini Flash.
|
||||
|
||||
Targets: Sustainment Systems, Off-Grid Systems, Defense & Tactics,
|
||||
Community Coordination, Leadership
|
||||
|
||||
Maps each to one of the 18 approved domains. 16 parallel workers,
|
||||
checkpoint file, crash-safe, incremental saves, progress every 5,000.
|
||||
|
||||
Usage:
|
||||
python3 /tmp/migrate_domains.py [--dry-run] [--workers 16] [--limit N]
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
import argparse
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from collections import defaultdict
|
||||
|
||||
import google.generativeai as genai
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import FieldCondition, MatchValue, Filter
|
||||
|
||||
# Suppress noisy HTTP logs
|
||||
import logging as _logging
|
||||
_logging.getLogger("httpx").setLevel(_logging.WARNING)
|
||||
_logging.getLogger("qdrant_client").setLevel(_logging.WARNING)
|
||||
|
||||
LOG_FILE = Path("/opt/recon/logs/migrate_domains.log")
|
||||
CHECKPOINT_FILE = Path("/opt/recon/data/migrate_domains_checkpoint.json")
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(message)s",
|
||||
handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
|
||||
)
|
||||
log = logging.getLogger("migrate_domains")
|
||||
|
||||
# ── Constants ───────────────────────────────────────────────────────────────
|
||||
|
||||
VALID_DOMAINS = {
|
||||
'Agriculture & Livestock', 'Civil Organization', 'Communications',
|
||||
'Food Systems', 'Foundational Skills', 'Logistics', 'Medical',
|
||||
'Navigation', 'Operations', 'Power Systems', 'Preservation & Storage',
|
||||
'Security', 'Shelter & Construction', 'Technology', 'Tools & Equipment',
|
||||
'Vehicles', 'Water Systems', 'Wilderness Skills',
|
||||
}
|
||||
|
||||
SOURCE_DOMAINS = {
|
||||
'Sustainment Systems', 'Off-Grid Systems', 'Defense & Tactics',
|
||||
'Community Coordination', 'Leadership',
|
||||
}
|
||||
|
||||
DOMAIN_LIST_STR = ', '.join(sorted(VALID_DOMAINS))
|
||||
|
||||
CLASSIFY_PROMPT = """\
|
||||
Classify this knowledge concept into exactly one domain from this list:
|
||||
Agriculture & Livestock, Civil Organization, Communications, Food Systems, Foundational Skills, Logistics, Medical, Navigation, Operations, Power Systems, Preservation & Storage, Security, Shelter & Construction, Technology, Tools & Equipment, Vehicles, Water Systems, Wilderness Skills
|
||||
|
||||
Return ONLY the exact domain string, nothing else. No explanation, no punctuation, no quotes.
|
||||
|
||||
Content: {content}
|
||||
Summary: {summary}
|
||||
Subdomain: {subdomain}
|
||||
"""
|
||||
|
||||
DOMAIN_FALLBACK = 'Foundational Skills'
|
||||
|
||||
# ── Key management ──────────────────────────────────────────────────────────
|
||||
|
||||
def load_gemini_keys(env_path=Path("/opt/recon/.env")):
    """Read every ``GEMINI_KEY_*`` value from a dotenv-style file.

    Args:
        env_path: file to read; defaults to the RECON deployment's .env.

    Returns:
        List of key strings in file order.

    Raises:
        FileNotFoundError: *env_path* does not exist.
        ValueError: the file defines no usable GEMINI_KEY_* entry.

    Tolerates an ``export `` prefix, whitespace around ``=``, and single or
    double quotes around the value. Lines with no ``=`` or an empty value
    are skipped instead of raising IndexError or yielding an empty key.
    """
    env_path = Path(env_path)
    if not env_path.exists():
        raise FileNotFoundError(f"{env_path} not found")

    keys = []
    for raw_line in env_path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        if not line.startswith("GEMINI_KEY_"):
            continue
        _, sep, value = line.partition("=")
        value = value.strip().strip("\"'")
        if sep and value:
            keys.append(value)

    if not keys:
        raise ValueError("No GEMINI_KEY_* found in .env")
    return keys
|
||||
|
||||
|
||||
class KeyRotator:
    """Hand out API keys round-robin; one instance is shared by all workers."""

    def __init__(self, keys):
        self.keys = keys
        self._i = 0  # number of keys handed out so far
        self._lock = threading.Lock()

    def next(self):
        """Return the next key, wrapping back to the first after the last."""
        with self._lock:
            slot = self._i % len(self.keys)
            self._i += 1
            return self.keys[slot]
|
||||
|
||||
|
||||
# ── Classification ──────────────────────────────────────────────────────────
|
||||
|
||||
def classify_domain(content, summary, subdomains, key):
    """Ask Gemini Flash to classify a concept into one of VALID_DOMAINS.

    Args:
        content: concept body; only the first 400 characters are sent.
        summary: concept summary; only the first 200 characters are sent.
        subdomains: list of subdomain tags; at most 10 are sent.
        key: Gemini API key to use for this call.

    Returns:
        A member of VALID_DOMAINS. When the model keeps answering with an
        invalid string, or the API keeps failing, the result comes from
        heuristic_fallback() instead.
    """
    prompt = CLASSIFY_PROMPT.format(
        content=str(content)[:400] if content else "(none)",
        summary=str(summary)[:200] if summary else "(none)",
        # str() each tag: a non-string tag would make join() raise here,
        # outside the try block, and fail the whole point.
        subdomain=", ".join(str(s) for s in subdomains[:10]) if subdomains else "(none)",
    )
    # NOTE(review): genai.configure() looks like process-wide state, so with
    # several worker threads the key actually used may not be the one passed
    # in — confirm against the SDK before relying on per-call key rotation.
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "text/plain"}
    )

    # Case-insensitive lookup table for the accepted answers.
    by_folded = {valid.casefold(): valid for valid in VALID_DOMAINS}

    for retry in range(4):
        try:
            resp = model.generate_content(prompt)
            value = resp.text.strip().strip('"').strip("'").strip()
            # Accept the answer regardless of case, stray quotes or a
            # trailing period ("medical." is a valid reply for "Medical").
            matched = by_folded.get(value.strip(" \t\r\n\"'.").casefold())
            if matched:
                return matched
            # Invalid — retry with stricter prompt
            if retry < 3:
                prompt = (
                    f"Your previous response '{value}' was invalid. "
                    f"You must return ONLY one of these exact strings: {DOMAIN_LIST_STR}\n\n"
                    f"Content: {str(content)[:300]}\n"
                    f"Return ONLY the exact domain string."
                )
        except Exception as e:
            err = str(e).lower()
            # NOTE(review): the bare "rate" token also matches unrelated words
            # such as "generate"; left as-is to avoid changing retry timing.
            if any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
                # Rate limited / temporarily unavailable: exponential backoff
                # with jitter, capped at 60 seconds.
                time.sleep(min(5 * (2 ** retry) + random.uniform(0, 3), 60))
            else:
                log.warning(f"Gemini error (attempt {retry+1}): {e}")
                # Give up early on errors that are not rate-limit related.
                if retry >= 2:
                    break

    return heuristic_fallback(content, summary, subdomains)
|
||||
|
||||
|
||||
def heuristic_fallback(content, summary, subdomains):
    """Pick a domain by keyword spotting when Gemini gave no usable answer.

    The summary, the subdomain tags and the first 200 characters of the
    content are lower-cased and scanned against the rule table below in
    order; the first rule with any keyword present as a substring wins.
    Returns DOMAIN_FALLBACK when no rule matches.
    """
    # (domain, keywords) — order matters, earlier rules take precedence.
    rules = (
        ("Agriculture & Livestock", (
            "farming", "agriculture", "livestock", "animal husbandry", "poultry",
            "cattle", "crop", "soil fertility", "irrigation for crops",
        )),
        ("Wilderness Skills", (
            "foraging", "hunting", "fishing", "bushcraft", "wilderness", "survival skill",
            "fire starting", "shelter building", "trapping", "tracking",
        )),
        ("Preservation & Storage", (
            "food preservation", "canning", "dehydration", "smoking", "pickling",
            "fermentation", "food storage", "freeze dry",
        )),
        ("Food Systems", (
            "cooking", "recipe", "nutrition", "food preparation", "baking",
            "food production", "meal",
        )),
        ("Medical", (
            "first aid", "medical", "trauma", "surgery", "anatomy", "pharmacology",
            "wound", "triage", "diagnosis", "disease", "infection", "veterinary",
            "herbal medicine", "medicinal plant",
        )),
        ("Communications", (
            "radio", "antenna", "ham radio", "communication", "signal",
            "networking", "meshtastic", "comms",
        )),
        ("Power Systems", (
            "solar", "battery", "generator", "wind turbine", "hydroelectric",
            "power grid", "inverter", "photovoltaic", "electricity",
        )),
        ("Water Systems", (
            "water purification", "water filter", "well", "rainwater",
            "sanitation", "water treatment", "desalination",
        )),
        ("Navigation", (
            "navigation", "compass", "map reading", "gps", "celestial",
            "orienteering", "land nav",
        )),
        ("Security", (
            "security", "opsec", "perimeter", "surveillance", "threat",
            "intrusion detection", "physical security",
        )),
        ("Vehicles", (
            "vehicle", "engine", "motor", "aircraft", "boat", "motorcycle",
            "truck", "maintenance", "diesel", "transmission",
        )),
        ("Tools & Equipment", (
            "tool", "equipment", "wrench", "saw", "drill", "hammer",
            "hand tool", "power tool", "blade", "sharpening",
        )),
        ("Shelter & Construction", (
            "construction", "building", "shelter", "carpentry", "masonry",
            "roofing", "concrete", "framing", "plumbing",
        )),
        ("Technology", (
            "electronics", "computer", "software", "circuit", "programming",
            "technology", "digital", "engineering",
        )),
        ("Logistics", (
            "supply chain", "logistics", "transport", "distribution",
            "inventory", "supply", "stockpile",
        )),
        ("Civil Organization", (
            "governance", "civil", "community", "administration", "organization",
            "council", "democratic", "municipal",
        )),
        ("Operations", (
            "tactics", "combat", "military", "mission", "patrol", "ambush",
            "defensive position", "fire team", "maneuver", "engagement",
            "search and rescue", "sar", "reconnaissance",
        )),
    )

    summary_part = summary or ''
    tags_part = ' '.join(subdomains or [])
    body_part = str(content or '')[:200]
    haystack = f"{summary_part} {tags_part} {body_part}".lower()

    for domain, keywords in rules:
        for keyword in keywords:
            if keyword in haystack:
                return domain

    return DOMAIN_FALLBACK
|
||||
|
||||
|
||||
# ── Checkpoint ──────────────────────────────────────────────────────────────
|
||||
|
||||
class Checkpoint:
    """Thread-safe record of completed point IDs, persisted for crash recovery.

    The set of completed IDs is kept in memory and written to *path* as
    ``{"completed": [...]}`` after every 1000 newly marked IDs and on
    explicit flush(). Writes go to a sibling ``.tmp`` file that is then
    swapped in, so a crash mid-write leaves the previous checkpoint intact.
    """

    def __init__(self, path):
        self.path = Path(path)
        self._lock = threading.Lock()
        self._completed = set()
        self._dirty = 0  # IDs marked since the last write to disk
        self._load()

    def _load(self):
        """Populate the completed set from disk, if a checkpoint exists."""
        if not self.path.exists():
            return
        try:
            data = json.loads(self.path.read_text())
            completed = set(data.get("completed", []))
        except (OSError, ValueError, AttributeError, TypeError) as exc:
            # Starting from scratch repeats already-paid API calls, so make
            # the reset visible instead of swallowing it.
            log.warning(f"Checkpoint {self.path} unreadable ({exc}); starting empty")
            self._completed = set()
            return
        self._completed = completed
        log.info(f"Loaded checkpoint: {len(self._completed):,} completed points")

    def is_done(self, point_id):
        """Return True if *point_id* has already been marked complete."""
        return point_id in self._completed

    def mark_done(self, point_id):
        """Record *point_id* as complete; persists every 1000 new marks."""
        with self._lock:
            self._completed.add(point_id)
            self._dirty += 1
            if self._dirty >= 1000:
                self._flush()

    def _flush(self):
        """Write the checkpoint to disk. Caller must hold ``self._lock``."""
        tmp = self.path.with_suffix('.tmp')
        tmp.write_text(json.dumps({"completed": list(self._completed)}))
        # replace() overwrites an existing target on every platform;
        # rename() does not on Windows.
        tmp.replace(self.path)
        self._dirty = 0

    def flush(self):
        """Force the checkpoint to disk now."""
        with self._lock:
            self._flush()

    def count(self):
        """Return the number of completed point IDs."""
        return len(self._completed)
|
||||
|
||||
|
||||
# ── Per-point processing ───────────────────────────────────────────────────
|
||||
|
||||
# Serialises updates to the shared ``stats`` dict: process_point runs on many
# worker threads and ``d[k] = d.get(k, 0) + 1`` is not atomic.
_STATS_LOCK = threading.Lock()


def process_point(point, qdrant, collection, key_rotator, checkpoint, dry_run, stats):
    """Reclassify one Qdrant point and write its new domain.

    Args:
        point: scrolled point exposing ``.id`` and ``.payload``.
        qdrant: client used for ``set_payload``.
        collection: collection name.
        key_rotator: object whose ``next()`` yields a Gemini API key.
        checkpoint: object with ``is_done(id)`` / ``mark_done(id)``.
        dry_run: when true, classify and count but do not write.
        stats: shared dict of ``"old -> new"`` counters, updated in place.

    Returns:
        "skipped" if already checkpointed, ``"would: old -> new"`` in dry
        run, otherwise "ok".
    """
    point_id = point.id
    if checkpoint.is_done(point_id):
        return "skipped"

    payload = point.payload or {}  # a point may carry no payload at all
    content = payload.get("content", payload.get("summary", ""))
    summary = payload.get("summary", "")
    subdomains = payload.get("subdomain", [])
    if isinstance(subdomains, str):
        subdomains = [subdomains]

    old_domain = payload.get("domain", [])
    if isinstance(old_domain, list):
        old_domain_str = old_domain[0] if old_domain else "(empty)"
    else:
        old_domain_str = str(old_domain)

    key = key_rotator.next()
    new_domain = classify_domain(content, summary, subdomains, key)

    # Track the mapping
    stats_key = f"{old_domain_str} -> {new_domain}"
    with _STATS_LOCK:
        stats[stats_key] = stats.get(stats_key, 0) + 1

    if dry_run:
        return f"would: {old_domain_str} -> {new_domain}"

    # Write new domain as single string
    qdrant.set_payload(
        collection_name=collection,
        payload={"domain": new_domain},
        points=[point_id],
    )

    # Mark only after the write succeeded so a crash re-processes the point.
    checkpoint.mark_done(point_id)
    return "ok"
|
||||
|
||||
|
||||
# ── Main loop ───────────────────────────────────────────────────────────────
|
||||
|
||||
SCROLL_BATCH = 5000
|
||||
|
||||
|
||||
def count_source_domains(qdrant, collection):
    """Return ``{source domain: exact number of points}`` for SOURCE_DOMAINS."""

    def _exact_count(domain):
        # One exact count per legacy domain, filtered on the "domain" payload.
        domain_filter = Filter(
            must=[FieldCondition(key="domain", match=MatchValue(value=domain))]
        )
        response = qdrant.count(
            collection_name=collection,
            count_filter=domain_filter,
            exact=True,
        )
        return response.count

    return {domain: _exact_count(domain) for domain in SOURCE_DOMAINS}
|
||||
|
||||
|
||||
def stream_and_process(qdrant, collection, rotator, checkpoint, workers, limit=None, dry_run=False):
    """Scroll each source domain in batches and reclassify with a thread pool.

    Args:
        qdrant: Qdrant client.
        collection: collection name.
        rotator: API key rotator passed through to process_point.
        checkpoint: Checkpoint used to skip/record completed points.
        workers: thread pool size.
        limit: stop after this many points have been handled (None or 0
            means no limit).
        dry_run: forwarded to process_point.

    Returns:
        ``(done, skipped_checkpoint, stats, start)``: points handled, points
        skipped because already checkpointed, the ``"old -> new"`` counters,
        and the ``time.time()`` value taken when processing began.
    """
    done = 0
    skipped_checkpoint = 0
    start = time.time()
    stats = {}  # shared mapping stats

    for source_domain in sorted(SOURCE_DOMAINS):
        log.info(f"\n--- Processing domain: {source_domain} ---")
        offset = None
        domain_done = 0

        while True:
            scroll_results, offset = qdrant.scroll(
                collection_name=collection,
                limit=SCROLL_BATCH,
                with_payload=True,
                with_vectors=False,
                offset=offset,
                scroll_filter=Filter(
                    must=[FieldCondition(key="domain", match=MatchValue(value=source_domain))]
                ),
            )

            if not scroll_results:
                if offset is None:
                    break
                continue

            # Filter already checkpointed
            pending = [p for p in scroll_results if not checkpoint.is_done(p.id)]
            skipped_checkpoint += len(scroll_results) - len(pending)

            if limit:
                # Honour --limit inside the batch: without this a small limit
                # still classifies (and rewrites) a full SCROLL_BATCH.
                pending = pending[:max(0, limit - done)]

            if pending:
                with ThreadPoolExecutor(max_workers=workers) as ex:
                    futures = {
                        ex.submit(process_point, p, qdrant, collection, rotator,
                                  checkpoint, dry_run, stats): p
                        for p in pending
                    }
                    # Results are consumed on this thread only, so the
                    # counters need no lock.
                    for future in as_completed(futures):
                        try:
                            future.result()
                        except Exception as e:
                            # A failed point is logged and still counted; it
                            # is not checkpointed, so a re-run retries it.
                            log.error(f"Worker error: {e}")
                        done += 1
                        domain_done += 1
                        if done % 5000 == 0:
                            elapsed = time.time() - start
                            rate = done / elapsed * 60
                            log.info(f" {done:,} done | {rate:.0f}/min | "
                                     f"elapsed {elapsed/60:.1f}min")
                            checkpoint.flush()
                        time.sleep(0.02)

            if limit and done >= limit:
                break
            if offset is None:
                break

        log.info(f" {source_domain}: {domain_done:,} vectors processed")

        if limit and done >= limit:
            break

    checkpoint.flush()
    return done, skipped_checkpoint, stats, start
|
||||
|
||||
|
||||
def main():
    """CLI entry point: report counts and cost, then dry-run or migrate.

    --dry-run classifies up to 4 sample points per source domain and prints
    the proposed mapping without writing. Otherwise every point in a source
    domain is reclassified via stream_and_process().
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true",
                        help="Classify 20 samples without writing")
    parser.add_argument("--workers", type=int, default=16)
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()

    keys = load_gemini_keys()
    rotator = KeyRotator(keys)

    qdrant = QdrantClient(host="localhost", port=6333, timeout=120)
    collection = "recon_knowledge"
    checkpoint = Checkpoint(CHECKPOINT_FILE)

    # Count source domains
    counts = count_source_domains(qdrant, collection)
    total_source = sum(counts.values())
    pre_checkpoint = checkpoint.count()

    log.info("Source domain counts:")
    for domain, count in sorted(counts.items(), key=lambda x: -x[1]):
        log.info(f" {domain:30s} {count:>10,}")
    log.info(f" {'TOTAL':30s} {total_source:>10,}")
    log.info(f"Checkpoint: {pre_checkpoint:,} already completed")
    log.info(f"Workers: {args.workers} | Keys: {len(keys)}")

    # Cost estimate
    # A migrated point has its "domain" overwritten with a new-taxonomy value,
    # so it no longer matches the source-domain filter and is already absent
    # from total_source. Subtracting the checkpoint count as well would
    # under-report (or go negative) when resuming.
    remaining = total_source
    input_tokens = remaining * 200
    output_tokens = remaining * 5
    input_cost = input_tokens / 1_000_000 * 0.10
    output_cost = output_tokens / 1_000_000 * 0.40
    total_cost = input_cost + output_cost
    log.info("\nEstimated Gemini 2.0 Flash cost:")
    log.info(f" Vectors to process: {remaining:,}")
    log.info(f" Input: ~{input_tokens/1_000_000:.1f}M tokens = ${input_cost:.2f}")
    log.info(f" Output: ~{output_tokens/1_000_000:.1f}M tokens = ${output_cost:.2f}")
    log.info(f" TOTAL: ~${total_cost:.2f}")

    if args.dry_run:
        log.info("\nDRY RUN: classifying 20 samples...\n")
        for source_domain in sorted(SOURCE_DOMAINS):
            scroll_results, _ = qdrant.scroll(
                collection_name=collection,
                limit=5,
                with_payload=True,
                with_vectors=False,
                scroll_filter=Filter(
                    must=[FieldCondition(key="domain", match=MatchValue(value=source_domain))]
                ),
            )
            for p in scroll_results[:4]:
                pay = p.payload or {}
                # str(): the ":25s" format spec and slicing need strings.
                title = str(pay.get("title") or "(no title)")
                content = pay.get("content", pay.get("summary", ""))
                summary = pay.get("summary", "")
                subdomains = pay.get("subdomain", [])
                if isinstance(subdomains, str):
                    subdomains = [subdomains]

                key = rotator.next()
                new_domain = classify_domain(content, summary, subdomains, key)

                old = pay.get("domain", [])
                if isinstance(old, list):
                    old = old[0] if old else "?"
                old = str(old)
                print(f" [{old:25s}] -> [{new_domain:25s}] {title[:60]}")

        print(f"\nDRY RUN complete. ~{remaining:,} vectors would be migrated.")
        print(f"Estimated cost: ~${total_cost:.2f}")
        return

    # ── Full migration ──────────────────────────────────────────────────
    log.info("\nStarting full migration...")

    done, skipped_ckpt, stats, start = stream_and_process(
        qdrant, collection, rotator, checkpoint, args.workers, args.limit
    )

    # Floor the elapsed time so the rate below cannot divide by zero.
    elapsed = max(time.time() - start, 1e-9)
    log.info(f"\n{'='*70}")
    log.info(f"MIGRATION COMPLETE in {elapsed/60:.1f}min:")
    log.info(f" Processed: {done:,}")
    log.info(f" Skipped (checkpoint): {skipped_ckpt:,}")
    log.info(f" Rate: {done/elapsed*60:.0f}/min")
    log.info("\nMapping distribution:")
    for mapping, count in sorted(stats.items(), key=lambda x: -x[1])[:30]:
        log.info(f" {mapping:<55s} {count:>8,}")


if __name__ == "__main__":
    main()
|
||||
469
scripts/migrate_skill_level.py
Executable file
469
scripts/migrate_skill_level.py
Executable file
|
|
@ -0,0 +1,469 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
migrate_skill_level.py — Replaces skill_level with knowledge_type + complexity
|
||||
on all vectors in Qdrant and on-disk concept JSONs.
|
||||
|
||||
Scrolls entire collection, classifies each concept via Gemini Flash,
|
||||
writes knowledge_type + complexity, deletes skill_level.
|
||||
|
||||
Crash-safe: completed point IDs tracked in checkpoint file.
|
||||
|
||||
Usage:
|
||||
python3 /opt/recon/scripts/migrate_skill_level.py [--dry-run] [--workers 16] [--limit N]
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
import argparse
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from collections import defaultdict
|
||||
|
||||
import google.generativeai as genai
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import FieldCondition, MatchValue, Filter
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, '/opt/recon')
|
||||
from lib.utils import get_config, setup_logging
|
||||
|
||||
# Suppress noisy HTTP request logging from qdrant_client/httpx
|
||||
import logging as _logging
|
||||
_logging.getLogger("httpx").setLevel(_logging.WARNING)
|
||||
_logging.getLogger("qdrant_client").setLevel(_logging.WARNING)
|
||||
|
||||
LOG_FILE = Path("/opt/recon/logs/migrate_skill_level.log")
|
||||
CHECKPOINT_FILE = Path("/opt/recon/data/migrate_skill_level_checkpoint.json")
|
||||
CONCEPTS_DIR = Path("/opt/recon/data/concepts")
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(message)s",
|
||||
handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
|
||||
)
|
||||
log = logging.getLogger("migrate_skill_level")
|
||||
|
||||
# ── Prompt ──────────────────────────────────────────────────────────────────
|
||||
|
||||
CLASSIFY_PROMPT = """\
|
||||
You are a knowledge classification engine. Given a concept, assign two fields:
|
||||
|
||||
knowledge_type — what KIND of knowledge this is:
|
||||
foundational — concepts, definitions, theory, background knowledge, explanations of how things work
|
||||
procedural — step-by-step techniques, instructions, how-to skills, methods you execute
|
||||
operational — application under real conditions, decision-making, mission execution, judgment calls in context
|
||||
|
||||
complexity — how much prior knowledge is needed:
|
||||
basic — requires little or no prior knowledge, introductory material, simple concepts
|
||||
intermediate — requires some domain familiarity, assumes foundational knowledge is in place
|
||||
advanced — requires significant experience or expertise, high-stakes or highly technical material
|
||||
|
||||
EXAMPLES:
|
||||
- "Needle chest decompression procedure" → procedural, advanced
|
||||
- "What is soil texture and why does it matter" → foundational, basic
|
||||
- "Coordinating a fire team withdrawal under contact" → operational, advanced
|
||||
- "How to start a campfire with a ferro rod" → procedural, basic
|
||||
- "Antenna gain and radiation patterns explained" → foundational, intermediate
|
||||
- "Triage decision-making in a mass casualty event" → operational, advanced
|
||||
- "Step-by-step: building a Dakota fire hole" → procedural, intermediate
|
||||
- "Understanding the water cycle" → foundational, basic
|
||||
|
||||
Concept title: {title}
|
||||
Concept domain: {domain}
|
||||
Concept subdomain: {subdomain}
|
||||
Concept content: {content}
|
||||
|
||||
Return ONLY valid JSON, no markdown, no explanation:
|
||||
{{"knowledge_type": "foundational|procedural|operational", "complexity": "basic|intermediate|advanced"}}
|
||||
"""
|
||||
|
||||
VALID_KNOWLEDGE_TYPES = {"foundational", "procedural", "operational"}
|
||||
VALID_COMPLEXITIES = {"basic", "intermediate", "advanced"}
|
||||
|
||||
# ── Key management ──────────────────────────────────────────────────────────
|
||||
|
||||
def load_gemini_keys(env_path="/opt/recon/.env"):
    """Collect Gemini API keys from ``GEMINI_KEY_*`` entries of a dotenv file.

    Args:
        env_path: Dotenv file to read. Defaults to the path this script has
            always used, so zero-argument callers are unaffected.

    Returns:
        list[str]: Non-empty key values, in file order.

    Raises:
        OSError: If the file cannot be read.
    """
    keys = []
    for raw_line in Path(env_path).read_text().splitlines():
        line = raw_line.strip()
        # Tolerate the `export NAME=value` spelling.
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        if not line.startswith("GEMINI_KEY_"):
            continue
        # partition() never raises; split("=", 1)[1] did on a line without "=".
        _, sep, value = line.partition("=")
        value = value.strip()
        # Drop one pair of matching surrounding quotes.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        # An empty value would only produce a request that is certain to fail.
        if sep and value:
            keys.append(value)
    return keys
|
||||
|
||||
|
||||
class KeyRotator:
    """Thread-safe round-robin dispenser of API keys."""

    def __init__(self, keys):
        """Store the key pool.

        Args:
            keys: Non-empty sequence of API keys.

        Raises:
            ValueError: If ``keys`` is empty. Previously this only surfaced as
                a ZeroDivisionError on the first ``next()`` call.
        """
        if not keys:
            raise ValueError("KeyRotator requires at least one API key")
        self.keys = keys
        self._i = 0
        self._lock = threading.Lock()

    def next(self):
        """Return the next key, wrapping around at the end of the pool."""
        with self._lock:
            key = self.keys[self._i % len(self.keys)]
            self._i += 1
        return key
|
||||
|
||||
# ── Classification ──────────────────────────────────────────────────────────
|
||||
|
||||
def classify(title, domains, subdomains, content, key):
    """Classify a concept's knowledge_type and complexity with Gemini Flash.

    Args:
        title: Concept title; a placeholder is sent when it is empty.
        domains: Domain names (list, or a single string); the first 5 are sent.
        subdomains: Subdomain tags (list, or a single string); the first 10
            are sent.
        content: Concept text; only the first 400 characters are sent.
        key: Gemini API key to use for this call.

    Returns:
        tuple[str, str]: ``(knowledge_type, complexity)``. Values returned
        from Gemini are checked against VALID_KNOWLEDGE_TYPES and
        VALID_COMPLEXITIES; if no valid answer is obtained, the result of
        ``heuristic_fallback()`` is returned instead.
    """
    # A bare string would otherwise be sliced and joined character by character.
    if isinstance(domains, str):
        domains = [domains]
    if isinstance(subdomains, str):
        subdomains = [subdomains]

    prompt = CLASSIFY_PROMPT.format(
        title=title or "(untitled)",
        domain=", ".join(domains[:5]) if domains else "(none)",
        subdomain=", ".join(subdomains[:10]) if subdomains else "(none)",
        content=str(content)[:400] if content else "(none)",
    )
    # NOTE(review): genai.configure() looks like process-wide state; with
    # several worker threads rotating keys, a request may go out under another
    # thread's key. Confirm against the SDK before relying on per-key quotas.
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"}
    )

    max_attempts = 4
    invalid_answers = 0
    for retry in range(max_attempts):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
            kt = data.get("knowledge_type", "").lower().strip()
            cx = data.get("complexity", "").lower().strip()
        except Exception as e:
            err = str(e).lower()
            # NOTE(review): the bare "rate" substring also matches words such
            # as "generate"; consider tightening it.
            if not any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
                break
            # Back off only when another attempt will actually follow.
            if retry < max_attempts - 1:
                time.sleep(min(5 * (2 ** retry) + random.uniform(0, 3), 60))
            continue

        if kt in VALID_KNOWLEDGE_TYPES and cx in VALID_COMPLEXITIES:
            return kt, cx
        # Well-formed JSON with values outside the allowed sets: retry once,
        # then give up. The old loop kept going until attempts ran out.
        invalid_answers += 1
        if invalid_answers > 1:
            break

    # Fallback heuristic based on title/subdomain/content keywords
    return heuristic_fallback(title, subdomains, content)
|
||||
|
||||
|
||||
def heuristic_fallback(title, subdomains, content):
    """Guess (knowledge_type, complexity) from keywords when Gemini fails.

    The title, all subdomain tags and the first 200 characters of the content
    are lower-cased and scanned for substrings.

    Args:
        title: Concept title (may be None or empty).
        subdomains: Iterable of tags, a single string, or None.
        content: Concept text (may be None or empty).

    Returns:
        tuple[str, str]: ``knowledge_type`` is "operational" if any
        operational signal matches, else "procedural" if any procedural signal
        matches, else "foundational". ``complexity`` is "basic" if any basic
        signal matches, else "advanced" if any advanced signal matches, else
        "intermediate".
    """
    # Missing fields must not crash the last-resort path, nor leak the literal
    # text "none" into the haystack.
    if isinstance(subdomains, str):
        subdomains = [subdomains]
    tags = " ".join(str(s) for s in (subdomains or []))
    body = str(content)[:200] if content else ""
    text = f"{title or ''} {tags} {body}".lower()

    # Knowledge type heuristic
    procedural_signals = ["how to", "step-by-step", "procedure", "instructions",
                          "method", "technique", "build", "make", "construct",
                          "install", "assemble", "recipe", "prepare"]
    operational_signals = ["decision", "coordinate", "execute", "deploy",
                           "mission", "triage", "under fire", "in the field",
                           "real-world", "scenario", "assessment", "plan"]

    # Operational is tested first, so it wins when both kinds of signal appear.
    if any(s in text for s in operational_signals):
        kt = "operational"
    elif any(s in text for s in procedural_signals):
        kt = "procedural"
    else:
        kt = "foundational"

    # Complexity heuristic — default intermediate (safest middle ground)
    cx = "intermediate"
    basic_signals = ["introduction", "what is", "basic", "beginner", "overview",
                     "definition", "simple", "fundamentals"]
    advanced_signals = ["advanced", "expert", "complex", "critical", "high-stakes",
                        "surgery", "trauma", "tactical", "classified"]
    if any(s in text for s in basic_signals):
        cx = "basic"
    elif any(s in text for s in advanced_signals):
        cx = "advanced"

    return kt, cx
|
||||
|
||||
# ── Checkpoint management ───────────────────────────────────────────────────
|
||||
|
||||
class Checkpoint:
    """Thread-safe record of completed point IDs, persisted as JSON.

    The file holds ``{"completed": [<point id>, ...]}``. Writes are batched:
    ``mark_done`` flushes to disk after every 1000 new IDs, and ``flush`` can
    be called to force a write.
    """

    # getLogger() returns the same logger object as the module-level `log`;
    # resolving it here keeps the class usable on its own.
    _log = logging.getLogger("migrate_skill_level")

    def __init__(self, path):
        """Create the tracker and load any existing checkpoint at ``path``.

        Args:
            path: ``pathlib.Path`` of the checkpoint file.
        """
        self.path = path
        self._lock = threading.Lock()
        self._completed = set()
        self._dirty = 0
        self._load()

    def _load(self):
        """Populate the completed set from disk; start empty if unreadable."""
        if not self.path.exists():
            return
        try:
            data = json.loads(self.path.read_text())
            self._completed = set(data.get("completed", []))
        except Exception as exc:
            # Still best-effort, but say so: silently restarting from zero
            # re-classifies (and re-bills) every point.
            self._completed = set()
            self._log.warning(
                f"Checkpoint {self.path} is unreadable ({exc}); starting from an empty checkpoint"
            )
            return
        self._log.info(f"Loaded checkpoint: {len(self._completed):,} completed points")

    def is_done(self, point_id):
        """Return True if ``point_id`` has been recorded as completed."""
        return point_id in self._completed

    def mark_done(self, point_id):
        """Record ``point_id``; write to disk once 1000 IDs are unsaved."""
        with self._lock:
            self._completed.add(point_id)
            self._dirty += 1
            if self._dirty >= 1000:
                self._flush()

    def _flush(self):
        """Write the checkpoint via a temp file. Caller must hold the lock."""
        self.path.parent.mkdir(parents=True, exist_ok=True)
        tmp = self.path.with_suffix('.tmp')
        tmp.write_text(json.dumps({"completed": list(self._completed)}))
        # replace() overwrites an existing target on every platform, whereas
        # rename() fails on Windows when the checkpoint already exists.
        tmp.replace(self.path)
        self._dirty = 0

    def flush(self):
        """Force the current state to disk."""
        with self._lock:
            self._flush()

    def count(self):
        """Return the number of completed point IDs."""
        return len(self._completed)
|
||||
|
||||
# ── Concept JSON update ────────────────────────────────────────────────────
|
||||
|
||||
def update_concept_json(doc_hash, title, knowledge_type, complexity):
    """Mirror a concept's new classification into its on-disk window JSON.

    Scans ``CONCEPTS_DIR/<doc_hash>/window_*.json``. In the first file that
    contains at least one dict concept whose ``"title"`` equals ``title``,
    every such concept gets ``knowledge_type`` and ``complexity`` set and
    ``skill_level`` removed; the file is rewritten and the scan stops.

    Args:
        doc_hash: Name of the document's directory under CONCEPTS_DIR.
        title: Concept title to match exactly.
        knowledge_type: Value to store under "knowledge_type".
        complexity: Value to store under "complexity".

    Returns:
        bool: True if a file was rewritten; False if the directory is missing,
        nothing matched, or every candidate file failed to process.
    """
    doc_dir = CONCEPTS_DIR / doc_hash
    if not doc_dir.exists():
        return False
    for wf in doc_dir.glob("window_*.json"):
        try:
            with open(wf, "r", encoding="utf-8") as f:
                concepts = json.load(f)
            changed = False
            for c in concepts:
                if not isinstance(c, dict):
                    continue
                if c.get("title") == title:
                    c["knowledge_type"] = knowledge_type
                    c["complexity"] = complexity
                    c.pop("skill_level", None)
                    changed = True
            if changed:
                # Write a sibling temp file and swap it in, so a crash
                # mid-write cannot leave truncated JSON behind. The thread id
                # keeps concurrent workers from sharing a temp file; the name
                # does not match the window_*.json glob.
                # NOTE(review): two workers updating the same window file can
                # still overwrite each other's change (read-modify-write).
                tmp = wf.with_name(f"{wf.name}.{threading.get_ident()}.tmp")
                with open(tmp, "w", encoding="utf-8") as f:
                    json.dump(concepts, f, indent=2, ensure_ascii=False)
                tmp.replace(wf)
                return True
        except Exception as exc:
            # Best-effort: keep scanning other windows, but leave a trace.
            log.warning(f"Could not update concept file {wf}: {exc}")
    return False
|
||||
|
||||
# ── Per-point processing ───────────────────────────────────────────────────
|
||||
|
||||
def process_point(point, qdrant, collection, key_rotator, checkpoint, dry_run):
    """Classify one Qdrant point and persist the result.

    Returns "skipped" when the checkpoint already lists the point, a
    ``"kt=..., cx=..."`` summary in dry-run mode (nothing is written), and
    "ok" after the Qdrant payload, the on-disk concept JSON and the
    checkpoint have all been updated.
    """
    pid = point.id
    if checkpoint.is_done(pid):
        return "skipped"

    def _listify(value):
        # Payload fields may hold a single string instead of a list.
        return [value] if isinstance(value, str) else value

    fields = point.payload
    title = fields.get("title", "")
    domains = _listify(fields.get("domain", []))
    subdomains = _listify(fields.get("subdomain", []))
    # Fall back to the summary when the point carries no content.
    content = fields.get("content", fields.get("summary", ""))
    doc_hash = fields.get("doc_hash", "")

    knowledge_type, complexity = classify(
        title, domains, subdomains, content, key_rotator.next()
    )

    if dry_run:
        return f"kt={knowledge_type}, cx={complexity}"

    # New fields go in first, then the legacy field is dropped.
    qdrant.set_payload(
        collection_name=collection,
        payload={"knowledge_type": knowledge_type, "complexity": complexity},
        points=[pid],
    )
    qdrant.delete_payload(
        collection_name=collection,
        keys=["skill_level"],
        points=[pid],
    )

    # Keep the concept JSON on disk in step with Qdrant.
    if doc_hash:
        update_concept_json(doc_hash, title, knowledge_type, complexity)

    checkpoint.mark_done(pid)
    return "ok"
|
||||
|
||||
# ── Streaming batch processor ───────────────────────────────────────────────
|
||||
|
||||
SCROLL_BATCH = 5000 # vectors per scroll batch — keeps memory bounded (~50MB)
|
||||
|
||||
|
||||
def count_collection(qdrant, collection):
    """Return ``points_count`` from the collection info Qdrant reports."""
    return qdrant.get_collection(collection).points_count
|
||||
|
||||
|
||||
def stream_and_process(qdrant, collection, rotator, checkpoint, workers, limit=None):
    """Scroll the collection in batches and migrate each batch with a thread pool.

    Only one scroll batch of SCROLL_BATCH points is held at a time. Points
    without a ``skill_level`` payload key, and points already listed in the
    checkpoint, are counted and skipped.

    Args:
        qdrant: Qdrant client used for scrolling and payload updates.
        collection: Collection name.
        rotator: Object whose ``next()`` yields an API key.
        checkpoint: Checkpoint tracker (``is_done`` / ``mark_done`` / ``flush``).
        workers: Thread-pool size.
        limit: Optional cap on the number of points processed in this run.

    Returns:
        tuple: ``(done, skipped_checkpoint, skipped_no_skill, results_agg,
        start)`` where ``results_agg`` maps each status string to its count
        and ``start`` is the ``time.time()`` value taken before scrolling.
    """
    results_agg = defaultdict(int)
    done = 0
    skipped_checkpoint = 0
    skipped_no_skill = 0
    total_estimate = count_collection(qdrant, collection)
    start = time.time()

    offset = None

    while True:
        scroll_results, offset = qdrant.scroll(
            collection_name=collection,
            limit=SCROLL_BATCH,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )

        # Filter to points needing migration
        pending = []
        for p in scroll_results:
            # A point without any payload has nothing to migrate.
            if "skill_level" not in (p.payload or {}):
                skipped_no_skill += 1
                continue
            if checkpoint.is_done(p.id):
                skipped_checkpoint += 1
                continue
            pending.append(p)

        # Honour the limit inside the batch; it used to be checked only after
        # a whole batch (up to SCROLL_BATCH points) had been processed.
        if limit:
            pending = pending[:max(limit - done, 0)]

        if pending:
            with ThreadPoolExecutor(max_workers=workers) as ex:
                futures = {
                    ex.submit(process_point, p, qdrant, collection, rotator, checkpoint, False): p
                    for p in pending
                }
                # Results are consumed on this thread only, so the counters
                # need no lock.
                for future in as_completed(futures):
                    try:
                        status = future.result()
                    except Exception as e:
                        status = f"error: {str(e)[:80]}"
                        log.error(f"Worker error: {e}")
                    results_agg[status] += 1
                    done += 1
                    if done % 5000 == 0:
                        elapsed = time.time() - start
                        rate = done / elapsed * 60
                        remaining = total_estimate - done - skipped_checkpoint - skipped_no_skill
                        eta = remaining / (done / elapsed) / 60 if done > 0 else 0
                        log.info(f" {done:,} done | {rate:.0f}/min | "
                                 f"ETA ~{eta:.0f}min | {dict(results_agg)}")
            # Persist progress after every batch so a crash loses little work.
            checkpoint.flush()
            time.sleep(0.02)

        if limit and done >= limit:
            break
        # Qdrant returns no next offset once the collection is exhausted.
        if offset is None:
            break

    checkpoint.flush()
    return done, skipped_checkpoint, skipped_no_skill, results_agg, start
|
||||
|
||||
|
||||
# ── Main ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Command-line entry point for the skill_level migration.

    With ``--dry-run`` it classifies up to 20 samples (one per distinct pair
    of leading domains found in the first 200 points) and prints the result
    without writing anything. Otherwise it runs the full streaming migration
    and logs a summary.

    Raises:
        SystemExit: If no Gemini API keys are configured.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true",
                        help="Classify 20 samples without writing anything")
    parser.add_argument("--workers", type=int, default=16)
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()

    config = get_config()
    keys = load_gemini_keys()
    # Without keys every classification would fail inside the workers.
    if not keys:
        raise SystemExit("No GEMINI_KEY_* entries found; cannot classify")
    rotator = KeyRotator(keys)

    qdrant = QdrantClient(
        host=config['vector_db']['host'],
        port=config['vector_db']['port'],
        timeout=120
    )
    collection = config['vector_db']['collection']
    checkpoint = Checkpoint(CHECKPOINT_FILE)

    total_vectors = count_collection(qdrant, collection)
    pre_checkpoint = checkpoint.count()

    log.info(f"Collection has {total_vectors:,} vectors")
    log.info(f"Checkpoint: {pre_checkpoint:,} already completed")
    log.info(f"Workers: {args.workers} | Keys: {len(keys)} | Dry run: {args.dry_run}")
    log.info(f"Estimated Gemini Flash cost: ~${(total_vectors - pre_checkpoint) * 0.0004:.2f}")
    log.info(f"Streaming in batches of {SCROLL_BATCH:,} (memory-bounded)")

    def _as_list(value):
        # Payload fields may hold a single string, a list, or nothing at all.
        if isinstance(value, str):
            return [value]
        return value or []

    if args.dry_run:
        # Scroll one batch, classify 20 diverse samples
        log.info("\nDRY RUN: classifying 20 samples...\n")
        scroll_results, _ = qdrant.scroll(
            collection_name=collection,
            limit=200,
            with_payload=True,
            with_vectors=False,
        )
        samples = []
        seen_domains = set()
        for p in scroll_results:
            if "skill_level" not in (p.payload or {}):
                continue
            domains = _as_list(p.payload.get("domain", []))
            d_key = tuple(sorted(domains[:2]))
            if d_key not in seen_domains:
                samples.append(p)
                seen_domains.add(d_key)
            if len(samples) >= 20:
                break

        for i, p in enumerate(samples, 1):
            pay = p.payload
            title = pay.get("title", "(no title)")
            # Normalise here too: a bare string used to reach classify() and
            # was joined character by character in the prompt.
            domains = _as_list(pay.get("domain", []))
            old_skill = pay.get("skill_level", "?")
            subdomains = _as_list(pay.get("subdomain", []))
            content = pay.get("content", pay.get("summary", ""))

            key = rotator.next()
            kt, cx = classify(title, domains, subdomains, content, key)

            print(f"\n--- Sample {i}/{len(samples)} ---")
            print(f" Title: {title}")
            print(f" Domain: {domains}")
            print(f" Old skill: {old_skill}")
            print(f" → knowledge_type: {kt}")
            print(f" → complexity: {cx}")
        est = total_vectors - pre_checkpoint
        print(f"\nDRY RUN complete. ~{est:,} vectors would be migrated.")
        print(f"Estimated Gemini Flash cost: ~${est * 0.0004:.2f}")
        return

    # ── Full migration run (streaming) ──────────────────────────────────────
    done, skipped_ckpt, skipped_no_skill, results, start = stream_and_process(
        qdrant, collection, rotator, checkpoint, args.workers, args.limit
    )

    elapsed = time.time() - start
    log.info(f"\nComplete in {elapsed/60:.1f}min:")
    log.info(f" Processed: {done:,}")
    log.info(f" Skipped (checkpoint): {skipped_ckpt:,}")
    log.info(f" Skipped (no skill): {skipped_no_skill:,}")
    for status, count in sorted(results.items(), key=lambda x: -x[1]):
        log.info(f" {status:<30} {count:>10,}")


if __name__ == "__main__":
    main()
|
||||
227
scripts/rebuild_qdrant.py
Executable file
227
scripts/rebuild_qdrant.py
Executable file
|
|
@ -0,0 +1,227 @@
|
|||
"""
|
||||
RECON Qdrant Rebuilder — patched for headless parallel execution
|
||||
|
||||
Deletes and recreates the Qdrant collection, then re-embeds ALL concept JSONs
|
||||
from disk using parallel workers. Pass --confirm to skip interactive prompt.
|
||||
|
||||
Usage:
|
||||
python3 scripts/rebuild_qdrant.py --confirm [--workers 8]
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import argparse
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from collections import defaultdict
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
import requests as http_requests
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import VectorParams, Distance, PointStruct
|
||||
|
||||
from lib.utils import get_config, concept_id, setup_logging
|
||||
from lib.status import StatusDB
|
||||
|
||||
logger = setup_logging('recon.rebuild')
|
||||
|
||||
|
||||
def embed_content(config, content):
    """Return an embedding for ``content``: TEI first, Ollama as fallback.

    Any failure on the TEI path is logged at debug level and the Ollama
    endpoint is tried instead; a failure there propagates to the caller.
    """
    try:
        tei_cfg = config['embedding']
        tei_response = http_requests.post(
            f"http://{tei_cfg['tei_host']}:{tei_cfg['tei_port']}/embed",
            json={"inputs": content},
            timeout=120,
        )
        tei_response.raise_for_status()
        return tei_response.json()[0]
    except Exception as tei_err:
        logger.debug(f"TEI failed, trying Ollama: {tei_err}")

    ollama_cfg = config['embedding']
    endpoint = f"http://{ollama_cfg['ollama_host']}:{ollama_cfg['ollama_port']}/api/embed"
    request_body = {"model": ollama_cfg['model'], "input": content}
    ollama_response = http_requests.post(endpoint, json=request_body, timeout=120)
    ollama_response.raise_for_status()
    return ollama_response.json()['embeddings'][0]
|
||||
|
||||
|
||||
def process_doc(doc_hash, config, db, qdrant, collection):
    """Embed and upsert all concepts for a single document.

    Reads every ``window_*.json`` under the document's concept directory,
    embeds each concept whose content has at least 10 non-blank characters,
    and upserts the points in batches of
    ``config['processing']['embed_batch_size']``.

    Args:
        doc_hash: Directory name of the document under the concepts root.
        config: Parsed configuration mapping.
        db: Status database handle (``get_document`` / ``update_status``).
        qdrant: Qdrant client.
        collection: Target collection name.

    Returns:
        tuple[int, int]: ``(inserted, failed)``: points actually upserted,
        and concepts whose embedding request failed.
    """
    doc_dir = os.path.join(config['paths']['concepts'], doc_hash)
    doc = db.get_document(doc_hash)
    filename = doc['filename'] if doc else doc_hash[:8]

    window_files = sorted([
        f for f in os.listdir(doc_dir)
        if f.startswith('window_') and f.endswith('.json')
    ])

    all_concepts = []
    for wf in window_files:
        path = os.path.join(doc_dir, wf)
        try:
            with open(path, encoding='utf-8') as f:
                concepts = json.load(f)
            if isinstance(concepts, list):
                all_concepts.extend(concepts)
        except Exception as e:
            logger.warning(f"Skipping corrupted window {wf} in {doc_hash}: {e}")

    if not all_concepts:
        return 0, 0

    is_web = doc.get('path', '').startswith(('http://', 'https://')) if doc else False

    # Check meta.json for explicit source_type (e.g. 'transcript')
    source_type = 'web' if is_web else 'document'
    text_dir = os.path.join(config['paths']['text'], doc_hash)
    meta_path = os.path.join(text_dir, 'meta.json')
    if os.path.exists(meta_path):
        try:
            with open(meta_path, encoding='utf-8') as mf:
                meta = json.load(mf)
            if meta.get('source_type'):
                source_type = meta['source_type']
        except Exception as e:
            # Unreadable metadata only costs the explicit source_type override.
            logger.debug(f"Ignoring unreadable meta.json for {doc_hash}: {e}")

    points = []
    failed = 0
    inserted = 0
    batch_size = config['processing']['embed_batch_size']

    # idx counts every entry, including skipped ones, because it is part of
    # the point ID and must stay stable between rebuilds.
    for idx, concept in enumerate(all_concepts):
        # One malformed entry must not abort the whole document.
        if not isinstance(concept, dict):
            continue
        content = concept.get('content', '')
        if not isinstance(content, str) or len(content.strip()) < 10:
            continue
        try:
            vector = embed_content(config, content)
        except Exception as e:
            logger.warning(f"Embedding failed {doc_hash}:{idx}: {e}")
            failed += 1
            continue

        start_page = concept.get('_start_page', 0)
        point_id = concept_id(doc_hash, start_page, idx)

        payload = {
            'doc_hash': doc_hash,
            'filename': filename,
            'book_title': doc.get('book_title', '') if doc else '',
            'book_author': doc.get('book_author', '') if doc else '',
            'source_type': source_type,
            'verification_status': 'unverified',
            'credibility_score': 0.7,
            'language': 'en',
        }
        for field in ['content', 'summary', 'title', 'domain', 'subdomain',
                      'keywords', 'skill_level', 'key_facts', 'scenario_applicable',
                      'cross_domain_tags', 'chapter', 'page_ref', 'notes',
                      '_window', '_start_page']:
            if field in concept:
                payload[field] = concept[field]

        points.append(PointStruct(id=point_id, vector=vector, payload=payload))

        if len(points) >= batch_size:
            qdrant.upsert(collection_name=collection, points=points)
            inserted += len(points)
            points = []

    if points:
        qdrant.upsert(collection_name=collection, points=points)
        inserted += len(points)

    # `inserted` is the number of points sent to Qdrant. The previous
    # len(all_concepts) - failed also counted concepts skipped for short content.
    if doc:
        db.update_status(doc_hash, 'complete', vectors_inserted=inserted)

    return inserted, failed
|
||||
|
||||
|
||||
def run_rebuild(workers=8):
    """Drop and recreate the Qdrant collection, then re-embed every document.

    Each sub-directory of the concepts root is processed by ``process_doc``
    on a thread pool. Progress is logged every 500 documents.

    Args:
        workers: Thread-pool size.
    """
    config = get_config()

    qdrant = QdrantClient(
        host=config['vector_db']['host'],
        port=config['vector_db']['port'],
        timeout=60
    )
    collection = config['vector_db']['collection']

    # Delete and recreate
    try:
        qdrant.delete_collection(collection)
        logger.info(f"Deleted collection: {collection}")
    except Exception as e:
        # Presumably the collection did not exist yet; create_collection
        # below will surface any real connectivity problem.
        logger.debug(f"Could not delete collection {collection}: {e}")

    qdrant.create_collection(
        collection_name=collection,
        vectors_config=VectorParams(
            size=config['embedding']['dimensions'],
            distance=Distance.COSINE
        )
    )
    logger.info(f"Created collection: {collection} ({config['embedding']['dimensions']}d, Cosine)")

    concepts_root = config['paths']['concepts']
    doc_dirs = sorted([
        d for d in os.listdir(concepts_root)
        if os.path.isdir(os.path.join(concepts_root, d))
    ])
    logger.info(f"Found {len(doc_dirs)} document concept directories | {workers} workers")

    total_inserted = 0
    total_failed = 0
    docs_errored = 0
    done = 0
    start = time.time()

    with ThreadPoolExecutor(max_workers=workers) as ex:
        # Every task gets its own StatusDB handle.
        # NOTE(review): the handles are constructed here on the submitting
        # thread but used on worker threads. Confirm StatusDB tolerates that.
        futures = {
            ex.submit(process_doc, h, config, StatusDB(), qdrant, collection): h
            for h in doc_dirs
        }
        # Results are consumed on this thread only, so the totals need no lock.
        for future in as_completed(futures):
            doc_hash = futures[future]
            try:
                inserted, failed = future.result()
            except Exception as e:
                logger.error(f"Worker error {doc_hash}: {e}")
                inserted, failed = 0, 0
                # Count the crash so the summary does not hide it.
                docs_errored += 1

            total_inserted += inserted
            total_failed += failed
            done += 1
            if done % 500 == 0:
                elapsed = time.time() - start
                rate = total_inserted / elapsed if elapsed > 0 else 0
                remaining = (len(doc_dirs) - done) / (done / elapsed) if elapsed > 0 else 0
                logger.info(
                    f" [{done}/{len(doc_dirs)}] "
                    f"{total_inserted:,} vectors | "
                    f"{rate:.0f}/sec | "
                    f"ETA {remaining/60:.0f}min"
                )

    elapsed = time.time() - start
    logger.info(f"\nRebuild complete in {elapsed/60:.1f} min: "
                f"{total_inserted:,} inserted, {total_failed:,} failed")
    if docs_errored:
        logger.warning(f"{docs_errored:,} documents raised errors and were not fully rebuilt")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    cli = argparse.ArgumentParser()
    cli.add_argument('--confirm', action='store_true', help='Skip interactive prompt')
    cli.add_argument('--workers', type=int, default=8)
    options = cli.parse_args()

    # The rebuild is destructive: without --confirm the operator has to type
    # the confirmation word, and anything else exits cleanly.
    if not options.confirm:
        print("WARNING: This will DELETE and RECREATE the Qdrant collection.")
        if input("Type 'REBUILD' to proceed: ") != 'REBUILD':
            print("Aborted.")
            sys.exit(0)

    run_rebuild(workers=options.workers)
|
||||
314
scripts/reenrich_reference.py
Executable file
314
scripts/reenrich_reference.py
Executable file
|
|
@ -0,0 +1,314 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
reenrich_reference.py — Re-classifies all remaining Reference-tagged concepts.
|
||||
|
||||
Scrolls Qdrant for vectors with domain == ["Reference"] or containing "Reference",
|
||||
calls Gemini with a hardened prompt that rejects Reference as a valid response,
|
||||
updates both Qdrant payload and concept JSON on disk.
|
||||
|
||||
Usage:
|
||||
python3 /opt/recon/scripts/reenrich_reference.py [--dry-run] [--workers 16] [--limit N]
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
import argparse
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from collections import defaultdict
|
||||
|
||||
import google.generativeai as genai
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import FieldCondition, MatchAny, Filter
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, '/opt/recon')
|
||||
from lib.utils import get_config, setup_logging
|
||||
|
||||
LOG_FILE = Path("/opt/recon/logs/reenrich_reference.log")
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(message)s",
|
||||
handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
|
||||
)
|
||||
log = logging.getLogger("reenrich_reference")
|
||||
|
||||
CONCEPTS_DIR = Path("/opt/recon/data/concepts")
|
||||
|
||||
CANONICAL_DOMAINS = {
|
||||
"Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
|
||||
"Foundational Skills", "Communications", "Medical", "Food Systems",
|
||||
"Navigation", "Logistics", "Power Systems", "Leadership",
|
||||
"Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
|
||||
}
|
||||
|
||||
# Hardened prompt — Reference explicitly forbidden, classification rules detailed
|
||||
CLASSIFY_PROMPT = """\
|
||||
You are a knowledge classification engine. Classify this concept into its correct domain.
|
||||
|
||||
VALID DOMAINS — use ONLY these exact strings:
|
||||
Defense & Tactics
|
||||
Sustainment Systems
|
||||
Off-Grid Systems
|
||||
Foundational Skills
|
||||
Communications
|
||||
Medical
|
||||
Food Systems
|
||||
Navigation
|
||||
Logistics
|
||||
Power Systems
|
||||
Leadership
|
||||
Scenario Playbooks
|
||||
Water Systems
|
||||
Security
|
||||
Community Coordination
|
||||
|
||||
FORBIDDEN: Do NOT output "Reference" under any circumstances. It is not a valid domain.
|
||||
FORBIDDEN: Do NOT output an empty domain list.
|
||||
|
||||
CLASSIFICATION RULES:
|
||||
- First aid, anatomy, pharmacology, herbs, veterinary, austere medicine, wound care → Medical
|
||||
- Food growing, foraging, hunting, fishing, animal husbandry, livestock → Sustainment Systems
|
||||
- Food preservation, canning, fermentation, food storage, dehydrating → Food Systems
|
||||
- Solar, wind, hydro, batteries, generators, inverters, charge controllers → Power Systems
|
||||
- Water sourcing, filtration, purification, sanitation, wells, rainwater → Water Systems
|
||||
- Radio, antennas, mesh networking, SIGINT, amateur radio → Communications
|
||||
- Weapons, tactics, NBC, security operations, field craft → Defense & Tactics
|
||||
- Permaculture, soil science, agroforestry, composting → Sustainment Systems
|
||||
- Shelter, construction, masonry, blacksmithing, woodworking, crafts → Foundational Skills
|
||||
- Navigation, land nav, celestial nav, map reading, compass → Navigation
|
||||
- Emergency planning, disaster prep, scenario planning → Scenario Playbooks
|
||||
- Leadership, governance, community organization → Leadership
|
||||
- Supply chain, transportation, inventory → Logistics
|
||||
- Physical security, perimeter, surveillance → Security
|
||||
- Community building, cooperation, mutual aid → Community Coordination
|
||||
- Biogas, wood gasification, rocket stoves, appropriate technology → Off-Grid Systems
|
||||
|
||||
If uncertain between two domains, pick the most actionable one for a self-reliant household.
|
||||
|
||||
Concept title: {title}
|
||||
Concept subdomain tags: {subdomain}
|
||||
Concept content: {content}
|
||||
|
||||
Return ONLY valid JSON, no markdown, no explanation:
|
||||
{{"domain": ["Domain Name"]}}
|
||||
"""
|
||||
|
||||
def load_gemini_keys(env_path="/opt/recon/.env"):
    """Collect Gemini API keys from ``GEMINI_KEY_*`` entries of a dotenv file.

    Args:
        env_path: Dotenv file to read. Defaults to the path this script has
            always used, so zero-argument callers are unaffected.

    Returns:
        list[str]: Non-empty key values, in file order.

    Raises:
        OSError: If the file cannot be read.
    """
    keys = []
    for raw_line in Path(env_path).read_text().splitlines():
        line = raw_line.strip()
        # Tolerate the `export NAME=value` spelling.
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        if not line.startswith("GEMINI_KEY_"):
            continue
        # partition() never raises; split("=", 1)[1] did on a line without "=".
        _, sep, value = line.partition("=")
        value = value.strip()
        # Drop one pair of matching surrounding quotes.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        # An empty value would only produce a request that is certain to fail.
        if sep and value:
            keys.append(value)
    return keys
|
||||
|
||||
class KeyRotator:
    """Thread-safe round-robin dispenser of API keys."""

    def __init__(self, keys):
        """Store the key pool.

        Args:
            keys: Non-empty sequence of API keys.

        Raises:
            ValueError: If ``keys`` is empty. Previously this only surfaced as
                a ZeroDivisionError on the first ``next()`` call.
        """
        if not keys:
            raise ValueError("KeyRotator requires at least one API key")
        self.keys = keys
        self._i = 0
        self._lock = threading.Lock()

    def next(self):
        """Return the next key, wrapping around at the end of the pool."""
        with self._lock:
            key = self.keys[self._i % len(self.keys)]
            self._i += 1
        return key
|
||||
|
||||
def classify(title, subdomains, content, key, attempt=0):
    """Ask Gemini for the concept's domain, never accepting "Reference".

    Args:
        title: Concept title; a placeholder is sent when it is empty.
        subdomains: Subdomain tags (list, or a single string); the first 10
            are sent.
        content: Concept text; only the first 400 characters are sent.
        key: Gemini API key to use for this call.
        attempt: Unused; kept for interface compatibility.

    Returns:
        list[str]: Domains from the response that are members of
        CANONICAL_DOMAINS, or the result of ``subdomain_fallback()`` when no
        valid answer is obtained.
    """
    if isinstance(subdomains, str):
        subdomains = [subdomains]

    prompt = CLASSIFY_PROMPT.format(
        title=title or "(untitled)",
        subdomain=", ".join(subdomains[:10]) if subdomains else "(none)",
        content=str(content)[:400] if content else "(none)",
    )
    # NOTE(review): genai.configure() looks like process-wide state; with
    # several worker threads rotating keys, a request may go out under another
    # thread's key. Confirm against the SDK before relying on per-key quotas.
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"}
    )

    max_attempts = 4
    invalid_answers = 0
    for retry in range(max_attempts):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
            raw = data.get("domain", [])
        except Exception as e:
            err = str(e).lower()
            # NOTE(review): the bare "rate" substring also matches words such
            # as "generate"; consider tightening it.
            if not any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
                break
            # Back off only when another attempt will actually follow.
            if retry < max_attempts - 1:
                time.sleep(min(5 * (2 ** retry) + random.uniform(0, 3), 60))
            continue

        # A bare string answer would otherwise be iterated character by
        # character and never match a canonical domain.
        if isinstance(raw, str):
            raw = [raw]
        domains = [
            d for d in (raw or [])
            if isinstance(d, str) and d in CANONICAL_DOMAINS  # strips Reference automatically
        ]
        if domains:
            return domains
        # Gemini returned Reference or nothing usable: ask once more with the
        # same prompt, then give up. The old loop kept going until attempts
        # ran out.
        invalid_answers += 1
        if invalid_answers > 1:
            break

    # Last resort: subdomain keyword heuristic
    return subdomain_fallback(subdomains)
|
||||
|
||||
# Ordered (keywords, domain) pairs: the first row with any keyword occurring
# as a substring of the combined subdomain text wins.
SUBDOMAIN_FALLBACK_MAP = [
    (["first aid", "trauma", "wound", "anatomy", "pharmacol", "herbal", "medicin", "veterinar", "dental", "surgery"], "Medical"),
    (["foraging", "hunting", "fishing", "livestock", "permaculture", "soil", "agroforestry", "mycolog", "mushroom"], "Sustainment Systems"),
    (["canning", "preservation", "fermentation", "food storage", "dehydrat"], "Food Systems"),
    (["solar", "battery", "generator", "inverter", "wind turbine", "photovoltaic"], "Power Systems"),
    (["water purif", "filtration", "sanitation", "well", "rainwater"], "Water Systems"),
    (["radio", "antenna", "mesh", "sigint", "amateur radio", "meshtastic"], "Communications"),
    (["weapon", "firearm", "tactic", "nbc", "chemical warfare", "ballistic"], "Defense & Tactics"),
    (["navigation", "compass", "land nav", "celestial"], "Navigation"),
    (["blacksmith", "woodwork", "masonry", "construct", "craft", "pottery"], "Foundational Skills"),
    (["biogas", "gasif", "rocket stove", "appropriate tech"], "Off-Grid Systems"),
    (["disaster", "emergency prep", "evacuation", "scenario"], "Scenario Playbooks"),
    (["leadership", "governance", "community"], "Leadership"),
    (["logistics", "supply chain", "transport"], "Logistics"),
    (["security", "perimeter", "surveillance"], "Security"),
]


def subdomain_fallback(subdomains):
    """Pick a domain from subdomain tags by keyword when Gemini gives none.

    Args:
        subdomains: Iterable of tags, a single string, or None.

    Returns:
        list[str]: One-element list holding the domain of the first matching
        row of SUBDOMAIN_FALLBACK_MAP, or ``["Foundational Skills"]`` when no
        row matches.
    """
    if isinstance(subdomains, str):
        subdomains = [subdomains]
    # str() keeps a stray non-string tag from crashing the last-resort path.
    combined = " ".join(str(s).lower() for s in (subdomains or []))
    for keywords, domain in SUBDOMAIN_FALLBACK_MAP:
        if any(kw in combined for kw in keywords):
            return [domain]
    return ["Foundational Skills"]  # absolute last resort
|
||||
|
||||
# process_point() runs on many worker threads and several points can live in
# the same window file, so the read-modify-write below must be serialised.
_CONCEPT_FILE_LOCK = threading.Lock()


def update_concept_json(doc_hash, title, new_domains):
    """Update the domain of a concept in the on-disk window JSON files.

    Scans ``CONCEPTS_DIR/<doc_hash>/window_*.json`` for concept dicts whose
    ``title`` equals *title* and whose ``domain`` contains "Reference" or has
    no canonical entry, and replaces that domain with *new_domains*.

    Args:
        doc_hash: Directory name of the document under CONCEPTS_DIR.
        title: Concept title to match exactly.
        new_domains: Replacement value for the concept's ``domain`` field.

    Returns:
        True as soon as one window file has been rewritten; False if the
        directory is missing or no file was changed. Unreadable or
        unwritable files are logged and skipped.
    """
    doc_dir = CONCEPTS_DIR / doc_hash
    if not doc_dir.exists():
        return False

    for wf in doc_dir.glob("window_*.json"):
        with _CONCEPT_FILE_LOCK:
            try:
                with open(wf, "r", encoding="utf-8") as f:
                    concepts = json.load(f)
            except (OSError, ValueError) as exc:
                log.warning("Skipping unreadable concept file %s: %s", wf, exc)
                continue
            if not isinstance(concepts, list):
                continue

            changed = False
            for c in concepts:
                if not isinstance(c, dict) or c.get("title") != title:
                    continue
                raw = c.get("domain")
                if isinstance(raw, str):
                    raw = [raw]
                elif not isinstance(raw, (list, tuple)):
                    # Missing / null / malformed domain counts as "no domain".
                    raw = []
                if "Reference" in raw or not any(d in CANONICAL_DOMAINS for d in raw):
                    c["domain"] = new_domains
                    changed = True
            if not changed:
                continue

            # Write to a sibling temp file and rename over the original so a
            # crash or error mid-write cannot leave a truncated window file.
            # The ".tmp" suffix keeps it out of the "window_*.json" glob.
            tmp = wf.with_name(wf.name + ".tmp")
            try:
                with open(tmp, "w", encoding="utf-8") as f:
                    json.dump(concepts, f, indent=2, ensure_ascii=False)
                tmp.replace(wf)
            except (OSError, TypeError, ValueError) as exc:
                log.warning("Could not rewrite concept file %s: %s", wf, exc)
                try:
                    tmp.unlink()
                except OSError:
                    pass
                continue
            return True
    return False
|
||||
|
||||
def process_point(point, qdrant, collection, key_rotator, dry_run):
    """Re-classify one Qdrant point and persist the new domain.

    Args:
        point: Qdrant record exposing ``.id`` and ``.payload``.
        qdrant: Client used for ``set_payload``.
        collection: Name of the collection holding *point*.
        key_rotator: Object whose ``next()`` yields the API key to use.
        dry_run: When true, nothing is classified or written.

    Returns:
        ``"would_classify"`` on a dry run, otherwise ``"ok"``.
    """
    # Bail out before taking a key or calling the classifier: a dry run must
    # not spend API quota.
    if dry_run:
        return "would_classify"

    payload = point.payload or {}
    title = payload.get("title", "")
    subdomains = payload.get("subdomain") or []
    if isinstance(subdomains, str):
        subdomains = [subdomains]
    content = payload.get("content", payload.get("summary", ""))
    doc_hash = payload.get("doc_hash", "")

    new_domains = classify(title, subdomains, content, key_rotator.next())

    # Update Qdrant payload
    qdrant.set_payload(
        collection_name=collection,
        payload={"domain": new_domains},
        points=[point.id],
    )

    # Update JSON on disk
    if doc_hash:
        update_concept_json(doc_hash, title, new_domains)

    return "ok"
|
||||
|
||||
def main():
    """Re-classify every Qdrant point whose domain contains "Reference".

    CLI flags: ``--dry-run`` (count only), ``--workers N`` (thread pool
    size, default 16) and ``--limit N`` (process at most N points).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--workers", type=int, default=16)
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()

    config = get_config()
    keys = load_gemini_keys()
    rotator = KeyRotator(keys)

    qdrant = QdrantClient(
        host=config['vector_db']['host'],
        port=config['vector_db']['port'],
        timeout=60
    )
    collection = config['vector_db']['collection']

    log.info("Scrolling Qdrant for Reference-tagged concepts...")

    # Scroll all points containing Reference in domain
    reference_filter = Filter(
        must=[FieldCondition(
            key="domain",
            match=MatchAny(any=["Reference"])
        )]
    )
    limit = args.limit if args.limit and args.limit > 0 else None
    offset = None
    reference_points = []
    while True:
        page, offset = qdrant.scroll(
            collection_name=collection,
            scroll_filter=reference_filter,
            limit=1000,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        reference_points.extend(page)
        if offset is None:
            break
        if limit is not None and len(reference_points) >= limit:
            break
    # Truncate after the loop so --limit also applies when everything
    # arrived in a single scroll page (offset is None on the first pass).
    if limit is not None:
        reference_points = reference_points[:limit]

    total = len(reference_points)
    log.info(f"Found {total:,} Reference-tagged vectors")
    log.info(f"Workers: {args.workers} | Keys: {len(keys)} | Dry run: {args.dry_run}")
    log.info(f"Estimated Gemini Flash cost: ~${total * 0.0004:.2f}")

    if args.dry_run:
        log.info(f"DRY RUN: would re-classify {total:,} concepts. Exiting.")
        return

    if not keys:
        log.error("No Gemini API keys loaded; cannot classify.")
        return

    # Tallies are only touched on this thread (as_completed yields here),
    # so no lock is required.
    tallies = defaultdict(int)
    done = 0
    start = time.time()

    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futures = {
            ex.submit(process_point, p, qdrant, collection, rotator, False): p
            for p in reference_points
        }
        for future in as_completed(futures):
            try:
                status = future.result()
            except Exception as exc:
                # One bad point must not abort a long batch run.
                log.warning("Point %s failed: %s", futures[future].id, exc)
                status = "error"
            tallies[status] += 1
            done += 1
            if done % 5000 == 0:
                elapsed = max(time.time() - start, 1e-9)
                rate = done / elapsed * 60
                eta = (total - done) / (done / elapsed) / 60
                log.info(f" {done:,}/{total:,} | {rate:.0f}/min | ETA {eta:.0f}min | {dict(tallies)}")

    elapsed = time.time() - start
    log.info(f"\nComplete in {elapsed/60:.1f}min:")
    for status, count in sorted(tallies.items(), key=lambda x: -x[1]):
        log.info(f" {status:<20} {count:>10,}")


if __name__ == "__main__":
    main()
|
||||
315
scripts/repair_corrupted.py
Executable file
315
scripts/repair_corrupted.py
Executable file
|
|
@ -0,0 +1,315 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
repair_corrupted.py — Repairs window files corrupted by concurrent writes.
|
||||
|
||||
Strategy:
|
||||
1. Read corrupted_windows.txt to get the list of bad files
|
||||
2. For each bad file, identify the parent doc hash from the path
|
||||
3. Check if the text directory still exists for that doc
|
||||
4. If yes: re-run Gemini enrichment on just that window
|
||||
5. If no text: mark as unrecoverable
|
||||
6. Report summary
|
||||
|
||||
Usage:
|
||||
python3 /opt/recon/scripts/repair_corrupted.py [--dry-run] [--workers 8]
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
import argparse
|
||||
import re
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from collections import defaultdict
|
||||
|
||||
import google.generativeai as genai
|
||||
|
||||
CORRUPTED_LIST = Path("/opt/recon/data/corrupted_windows.txt")
|
||||
TEXT_DIR = Path("/opt/recon/data/text")
|
||||
CONCEPTS_DIR = Path("/opt/recon/data/concepts")
|
||||
LOG_FILE = Path("/opt/recon/logs/repair_corrupted.log")
|
||||
UNRECOVERABLE_LOG = Path("/opt/recon/data/unrecoverable_windows.txt")
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler(LOG_FILE),
|
||||
logging.StreamHandler(),
|
||||
]
|
||||
)
|
||||
log = logging.getLogger("repair_corrupted")
|
||||
|
||||
CANONICAL_DOMAINS = [
|
||||
"Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
|
||||
"Foundational Skills", "Communications", "Medical", "Food Systems",
|
||||
"Navigation", "Logistics", "Power Systems", "Leadership",
|
||||
"Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
|
||||
]
|
||||
|
||||
ENRICH_PROMPT = """Extract knowledge concepts from this document text.
|
||||
|
||||
A concept is a SELF-CONTAINED piece of knowledge that can stand alone.
|
||||
|
||||
For each concept, provide ALL fields:
|
||||
|
||||
Required:
|
||||
- content: Full text of the concept (complete procedure, definition, etc.)
|
||||
- summary: 1-2 sentence summary
|
||||
- title: Brief descriptive title
|
||||
- domain: Array of 1-5 from ONLY these exact strings (no others):
|
||||
Defense & Tactics, Sustainment Systems, Off-Grid Systems, Foundational Skills,
|
||||
Communications, Medical, Food Systems, Navigation, Logistics, Power Systems,
|
||||
Leadership, Scenario Playbooks, Water Systems, Security, Community Coordination
|
||||
CRITICAL: Do NOT use "Reference". Every concept belongs somewhere specific.
|
||||
- subdomain: Array of specific subcategories (up to 10)
|
||||
- keywords: Array of 3-30 searchable terms
|
||||
- skill_level: novice | intermediate | advanced
|
||||
- key_facts: Array of specific extractable claims, measurements, data points
|
||||
|
||||
Optional (include when present):
|
||||
- scenario_applicable: Array from: tuesday_prepper, month_prepper, year_prepper, multi_year, eotwawki
|
||||
- cross_domain_tags: Array from: sustainment, medical, security, communications, leadership, logistics, navigation, power_systems, water_systems, food_systems, tactical_ops, community_coordination
|
||||
- chapter: Chapter name if identifiable
|
||||
- page_ref: Page reference
|
||||
|
||||
Return JSON array. If no extractable concepts, return [].
|
||||
|
||||
Document text:
|
||||
"""
|
||||
|
||||
def load_gemini_keys(env_path="/opt/recon/.env"):
    """Read every ``GEMINI_KEY_*`` value from a dotenv-style file.

    Args:
        env_path: File to read; defaults to the deployment's ``.env``.

    Returns:
        List of non-empty key strings in file order. Leading whitespace and
        an ``export `` prefix are ignored, surrounding quotes are stripped,
        and lines without ``=`` or with an empty value are skipped.

    Raises:
        OSError: If the file cannot be read.
    """
    keys = []
    for raw_line in Path(env_path).read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        if not line.startswith("GEMINI_KEY_") or "=" not in line:
            continue
        value = line.split("=", 1)[1].strip()
        # KEY="value" / KEY='value' — the quotes are not part of the key.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        if value:
            keys.append(value)
    return keys
|
||||
|
||||
class KeyRotator:
    """Hands out API keys round-robin; safe to share between threads."""

    def __init__(self, keys):
        # Copy so later mutation of the caller's list cannot change the
        # rotation; None is accepted and treated as "no keys".
        self.keys = list(keys or [])
        self._i = 0
        self._lock = threading.Lock()

    def next(self):
        """Return the next key in rotation.

        Raises:
            RuntimeError: If the rotator was built without any keys.
        """
        with self._lock:
            if not self.keys:
                # Without this guard the modulo below would raise an opaque
                # ZeroDivisionError.
                raise RuntimeError("KeyRotator has no API keys to hand out")
            key = self.keys[self._i % len(self.keys)]
            self._i += 1
            return key
|
||||
|
||||
def repair_json_truncated(text):
    """Last-ditch attempt to salvage a truncated JSON array.

    Strips control characters and trailing commas and tries a normal parse.
    If that fails, the text is cut after the last top-level ``}`` that
    closes cleanly and any arrays still open at that point are closed.

    Args:
        text: Raw (possibly truncated) JSON text.

    Returns:
        The parsed value, or None if nothing could be salvaged or *text*
        is not a string.
    """
    if not isinstance(text, str):
        return None

    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
    text = re.sub(r',\s*([}\]])', r'\1', text)
    try:
        return json.loads(text)
    except (ValueError, RecursionError):
        pass

    # Find the last complete top-level object. Brackets are tracked only
    # outside string literals, so "[" or "]" inside a value cannot skew the
    # number of closers appended below.
    last_close = -1
    closers_needed = 0
    brace_depth = 0
    bracket_depth = 0
    in_str = False
    esc = False
    for i, ch in enumerate(text):
        if esc:
            esc = False
            continue
        if in_str:
            if ch == '\\':
                esc = True
            elif ch == '"':
                in_str = False
            continue
        if ch == '"':
            in_str = True
        elif ch == '{':
            brace_depth += 1
        elif ch == '[':
            bracket_depth += 1
        elif ch == ']':
            bracket_depth -= 1
        elif ch == '}':
            brace_depth -= 1
            if brace_depth == 0:
                last_close = i
                closers_needed = bracket_depth

    if last_close > 0:
        trimmed = text[:last_close + 1]
        try:
            return json.loads(trimmed + ']' * max(closers_needed, 0))
        except (ValueError, RecursionError):
            pass
    return None
|
||||
|
||||
def enrich_window_text(text, key):
    """Call Gemini on raw window text, return concepts list.

    Args:
        text: Window text appended to ENRICH_PROMPT.
        key: API key passed to ``genai.configure``.

    Returns:
        List of concept dicts (possibly empty when the model returned an
        empty array), or None when every attempt failed — including when
        the response could not be parsed as JSON.
    """
    # NOTE(review): genai.configure() looks like module-wide configuration;
    # with several worker threads rotating keys, the key in effect for a
    # given request may not be this one — confirm against the SDK docs.
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"}
    )
    max_attempts = 4
    for attempt in range(max_attempts):
        try:
            resp = model.generate_content(ENRICH_PROMPT + text)
            raw = resp.text
        except Exception as e:
            err = str(e).lower()
            if not any(s in err for s in ["429", "quota", "rate", "503", "unavailable"]):
                log.warning(f" Non-transient error: {e}")
                break
            # Back off only if another attempt will actually follow.
            if attempt < max_attempts - 1:
                delay = min(5 * (2 ** attempt) + random.uniform(0, 3), 60)
                time.sleep(delay)
            continue

        result = None
        if isinstance(raw, str):
            try:
                result = json.loads(raw)
            except ValueError:
                result = repair_json_truncated(raw)

        if isinstance(result, list):
            return [c for c in result if isinstance(c, dict)]
        if isinstance(result, dict):
            return [result]

        # An unparseable reply is a failure, not "no concepts": returning []
        # here would let the caller overwrite the window with an empty list.
        log.warning(" Unparseable Gemini response (attempt %d/%d)", attempt + 1, max_attempts)
    return None  # failed
|
||||
|
||||
def get_window_text(doc_hash, window_filename, window_size=5, text_dir=None):
    """Reconstruct window text from page files.

    Args:
        doc_hash: Directory name of the document under the text root.
        window_filename: ``window_NNNN.json``; NNNN is the 1-based window
            number.
        window_size: Pages per window (default 5).
        text_dir: Root holding per-document page directories; defaults to
            the module-level TEXT_DIR.

    Returns:
        The window's pages joined with ``--- Page N ---`` headers, or None
        if the filename is malformed, the document has no page files, or
        the window lies outside the available pages.
    """
    # Window filename: window_NNNN.json -> window index is NNNN
    try:
        w_idx = int(Path(window_filename).stem.split('_')[1]) - 1
    except (IndexError, ValueError):
        return None
    # A negative index would turn the slice below into a from-the-end slice.
    if w_idx < 0 or window_size < 1 or not doc_hash:
        return None

    root = TEXT_DIR if text_dir is None else Path(text_dir)
    text_path = root / doc_hash
    if not text_path.is_dir():
        return None

    def page_order(p):
        # Sort by the page number in the name so "page_10" follows "page_9"
        # even when the numbers are not zero-padded.
        m = re.search(r'\d+', p.stem)
        return (int(m.group()) if m else float('inf'), p.name)

    page_files = sorted(
        (f for f in text_path.iterdir()
         if f.name.startswith('page_') and f.name.endswith('.txt')),
        key=page_order,
    )
    if not page_files:
        return None

    start = w_idx * window_size
    window_pages = page_files[start:start + window_size]
    if not window_pages:
        return None

    parts = []
    for j, pf in enumerate(window_pages):
        try:
            text = pf.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            log.warning(f" Skipping unreadable page {pf}: {e}")
            continue
        parts.append(f"--- Page {start + j + 1} ---\n{text}")
    return "\n\n".join(parts) if parts else None
|
||||
|
||||
def repair_file(corrupted_path, key_rotator, dry_run):
    """Attempt to repair a single corrupted window file.

    Args:
        corrupted_path: Path to the window file, expected to look like
            ``.../concepts/{hash}/window_NNNN.json``.
        key_rotator: Object whose ``next()`` yields an API key.
        dry_run: When true, stop after confirming source text exists.

    Returns:
        One of ``"already_valid"``, ``"no_source_text"``, ``"would_repair"``,
        ``"enrichment_failed"``, ``"repaired"`` or ``"write_error"``.
    """
    path = Path(corrupted_path)

    # Sanity check -- the file may already parse (e.g. repaired earlier).
    try:
        with open(path, encoding='utf-8') as f:
            json.load(f)
        return "already_valid"
    except (OSError, ValueError):
        pass

    # Extract doc hash and window name from path structure
    doc_hash = path.parent.name
    window_filename = path.name

    # Get source text for this window
    window_text = get_window_text(doc_hash, window_filename)
    if not window_text:
        return "no_source_text"

    if dry_run:
        return "would_repair"

    # Re-enrich from source text
    concepts = enrich_window_text(window_text, key_rotator.next())
    if concepts is None:
        return "enrichment_failed"

    # Tag concepts with metadata
    window_size = 5
    try:
        window_no = int(path.stem.split('_')[1])
        start_page = (window_no - 1) * window_size + 1
    except (IndexError, ValueError):
        window_no = 1
        start_page = 0

    for c in concepts:
        c['_window'] = window_no
        c['_start_page'] = start_page
        c['_doc_hash'] = doc_hash
        c['_repaired'] = True

    # Write to a sibling temp file and rename it into place, so an
    # interrupted write cannot leave another truncated window file behind.
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(concepts, f, indent=2, ensure_ascii=False)
        tmp_path.replace(path)
    except (OSError, TypeError, ValueError) as e:
        log.warning(f" Failed to write {path}: {e}")
        try:
            tmp_path.unlink()
        except OSError:
            pass
        return "write_error"
    return "repaired"
|
||||
|
||||
def main():
    """Repair every window file listed in CORRUPTED_LIST and report a summary.

    CLI flags: ``--dry-run`` (report what would be repaired) and
    ``--workers N`` (thread pool size, default 8). Files that could not be
    repaired are written to UNRECOVERABLE_LOG as ``path<TAB>reason``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--workers", type=int, default=8)
    args = parser.parse_args()

    if not CORRUPTED_LIST.exists():
        log.error(f"Corrupted list not found: {CORRUPTED_LIST}")
        log.error("Run Task 1 first to generate it.")
        return

    keys = load_gemini_keys()
    if not keys and not args.dry_run:
        log.error("No GEMINI_KEY_* entries found; cannot re-enrich.")
        return
    rotator = KeyRotator(keys)

    # First tab-separated field of each line is the path. Blank lines are
    # skipped and duplicates collapsed so a file is never handed to two
    # workers at once.
    unique_paths = {}
    with open(CORRUPTED_LIST, encoding='utf-8') as f:
        for line in f:
            entry = line.strip().split('\t')[0].strip()
            if entry:
                unique_paths.setdefault(entry, None)
    corrupted = list(unique_paths)

    log.info(f"Repairing {len(corrupted):,} corrupted window files")
    log.info(f"Dry run: {args.dry_run} | Workers: {args.workers} | Keys: {len(keys)}")

    # Tallies are only touched on this thread (as_completed yields here),
    # so no lock is required.
    results = defaultdict(int)
    unrecoverable = []
    failure_statuses = ("no_source_text", "enrichment_failed", "write_error", "worker_error")

    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futures = {ex.submit(repair_file, p, rotator, args.dry_run): p for p in corrupted}
        done = 0
        for future in as_completed(futures):
            path = futures[future]
            try:
                status = future.result()
            except Exception as e:
                # One bad file must not abort the whole batch.
                log.warning(f" Worker crashed on {path}: {e}")
                status = "worker_error"
            results[status] += 1
            if status in failure_statuses:
                unrecoverable.append((path, status))
            done += 1
            if done % 100 == 0:
                log.info(f" {done:,}/{len(corrupted):,} | {dict(results)}")

    log.info("── Results ─────────────────────────────────────────────────")
    for status, count in sorted(results.items(), key=lambda x: -x[1]):
        log.info(f" {status:<25} {count:>8,}")

    if unrecoverable:
        with open(UNRECOVERABLE_LOG, 'w', encoding='utf-8') as f:
            for path, reason in unrecoverable:
                f.write(f"{path}\t{reason}\n")
        log.info(f"\n Unrecoverable: {len(unrecoverable)} — logged to {UNRECOVERABLE_LOG}")
    elif args.dry_run:
        log.info("\n Dry run complete — no files were modified.")
    else:
        log.info("\n All files repaired successfully.")


if __name__ == "__main__":
    main()
|
||||
178
scripts/validate.py
Executable file
178
scripts/validate.py
Executable file
|
|
@ -0,0 +1,178 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
RECON Pipeline Validator
|
||||
|
||||
Checks pipeline consistency: paths, DB state, file integrity, and service connectivity.
|
||||
Validates TEI, Ollama, and Qdrant are reachable. Deep mode checks every document on disk.
|
||||
|
||||
Usage: python3 scripts/validate.py [--deep]
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from lib.utils import get_config, setup_logging
|
||||
from lib.status import StatusDB
|
||||
|
||||
logger = setup_logging('recon.validate')
|
||||
|
||||
|
||||
def _check_window_files(doc_concepts_dir, windows, short_hash, issues):
    """Append an issue for each window file that is unreadable or not a JSON array."""
    for wf in windows:
        try:
            with open(os.path.join(doc_concepts_dir, wf), encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError:
            issues.append(f"[{short_hash}] {wf}: invalid JSON")
        except (OSError, UnicodeDecodeError) as e:
            issues.append(f"[{short_hash}] {wf}: unreadable ({e})")
        else:
            if not isinstance(data, list):
                issues.append(f"[{short_hash}] {wf}: not a JSON array")


def run_validation(deep=False):
    """Check pipeline paths, DB counts, on-disk files and service reachability.

    Args:
        deep: When true, also verify every document's text/concept files and
            report orphaned directories.

    Returns:
        True if no issues were found (warnings do not count), else False.
        Findings are printed to stdout.
    """
    config = get_config()
    db = StatusDB()

    issues = []
    warnings = []

    print("=== RECON Validation ===\n")

    # Check paths
    for name, path in config['paths'].items():
        if os.path.exists(path):
            continue
        if name == 'db':
            issues.append(f"Database not found: {path}")
        else:
            warnings.append(f"Directory missing: {name} = {path}")

    # Check library
    if not os.path.exists(config['library_root']):
        issues.append(f"Library root not found: {config['library_root']}")

    # Check Gemini keys
    keys = config.get('gemini_keys', [])
    if not keys:
        warnings.append("No Gemini API keys configured in .env")
    else:
        print(f" Gemini keys: {len(keys)} configured")

    # DB status counts
    counts = db.get_status_counts()
    cat = counts.get('catalogue', {})
    doc = counts.get('documents', {})

    print(f" Catalogue: {sum(cat.values())} entries")
    print(f" Documents: {sum(doc.values())} entries")
    print(f" Complete: {doc.get('complete', 0)}")
    print(f" Failed: {doc.get('failed', 0)}")

    if deep:
        print("\n--- Deep Validation ---\n")

        # Check every document in pipeline has corresponding files
        all_docs = db.get_all_documents()
        text_dir = config['paths']['text']
        concepts_dir = config['paths']['concepts']

        for d in all_docs:
            h = d['hash']
            status = d['status']

            if status in ('extracted', 'enriched', 'complete'):
                doc_text_dir = os.path.join(text_dir, h)
                if not os.path.exists(doc_text_dir):
                    issues.append(f"[{h[:8]}] {d['filename']}: text dir missing but status={status}")
                else:
                    pages = [f for f in os.listdir(doc_text_dir) if f.startswith('page_')]
                    if not pages:
                        issues.append(f"[{h[:8]}] {d['filename']}: no page files in text dir")

            if status in ('enriched', 'complete'):
                doc_concepts_dir = os.path.join(concepts_dir, h)
                if not os.path.exists(doc_concepts_dir):
                    issues.append(f"[{h[:8]}] {d['filename']}: concepts dir missing but status={status}")
                else:
                    windows = [f for f in os.listdir(doc_concepts_dir) if f.startswith('window_')]
                    if not windows:
                        issues.append(f"[{h[:8]}] {d['filename']}: no window files in concepts dir")
                    else:
                        _check_window_files(doc_concepts_dir, windows, h[:8], issues)

        # Check orphaned directories. The hash set is built up front so the
        # concepts check still works when the text directory is absent.
        doc_hashes = {d['hash'] for d in all_docs}
        if os.path.exists(text_dir):
            for dirname in os.listdir(text_dir):
                if dirname not in doc_hashes:
                    warnings.append(f"Orphaned text dir: {dirname}")

        if os.path.exists(concepts_dir):
            for dirname in os.listdir(concepts_dir):
                if dirname not in doc_hashes:
                    warnings.append(f"Orphaned concepts dir: {dirname}")

        print(f" Checked {len(all_docs)} documents")

    # Connectivity checks
    print("\n--- Connectivity ---\n")

    import requests as http_requests

    # Check TEI (primary embedding backend)
    try:
        tei_url = f"http://{config['embedding']['tei_host']}:{config['embedding']['tei_port']}/info"
        resp = http_requests.get(tei_url, timeout=10)
        if resp.status_code == 200:
            print(f" TEI: OK (bge-m3 at {config['embedding']['tei_host']}:{config['embedding']['tei_port']})")
        else:
            issues.append(f"TEI: HTTP {resp.status_code}")
    except Exception as e:
        issues.append(f"TEI: unreachable ({e})")

    # Check Ollama (fallback) — failures here are warnings, not issues.
    try:
        ollama_url = f"http://{config['embedding']['ollama_host']}:{config['embedding']['ollama_port']}/api/tags"
        resp = http_requests.get(ollama_url, timeout=10)
        if resp.status_code == 200:
            print(f" Ollama: OK (fallback at {config['embedding']['ollama_host']}:{config['embedding']['ollama_port']})")
        else:
            warnings.append(f"Ollama: HTTP {resp.status_code}")
    except Exception as e:
        warnings.append(f"Ollama: unreachable ({e}) — fallback only, not critical")

    # Check Qdrant and the configured collection
    try:
        from qdrant_client import QdrantClient
        qdrant = QdrantClient(
            host=config['vector_db']['host'],
            port=config['vector_db']['port'],
            timeout=10
        )
        collections = [c.name for c in qdrant.get_collections().collections]
        target = config['vector_db']['collection']
        if target in collections:
            info = qdrant.get_collection(target)
            print(f" Qdrant: OK ({target}: {info.points_count} points)")
        else:
            issues.append(f"Qdrant: collection {target} not found")
    except Exception as e:
        issues.append(f"Qdrant: unreachable ({e})")

    # Summary
    print("\n--- Summary ---\n")

    if warnings:
        print(f"Warnings ({len(warnings)}):")
        for w in warnings:
            print(f" ⚠ {w}")

    if issues:
        print(f"\nIssues ({len(issues)}):")
        for i in issues:
            print(f" ✗ {i}")
        print(f"\nValidation FAILED: {len(issues)} issue(s)")
    else:
        print("Validation PASSED")

    return not issues


if __name__ == '__main__':
    deep = '--deep' in sys.argv
    # Non-zero exit status lets cron/CI detect a failed validation.
    sys.exit(0 if run_validation(deep=deep) else 1)
|
||||
Loading…
Add table
Add a link
Reference in a new issue