mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Initial commit: RECON codebase baseline
Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
563c16bb71
59 changed files with 18327 additions and 0 deletions
314
scripts/reenrich_reference.py
Executable file
314
scripts/reenrich_reference.py
Executable file
|
|
@ -0,0 +1,314 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
reenrich_reference.py — Re-classifies all remaining Reference-tagged concepts.
|
||||
|
||||
Scrolls Qdrant for vectors with domain == ["Reference"] or containing "Reference",
|
||||
calls Gemini with a hardened prompt that rejects Reference as a valid response,
|
||||
updates both Qdrant payload and concept JSON on disk.
|
||||
|
||||
Usage:
|
||||
python3 /opt/recon/scripts/reenrich_reference.py [--dry-run] [--workers 16] [--limit N]
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
import argparse
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from collections import defaultdict
|
||||
|
||||
import google.generativeai as genai
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import FieldCondition, MatchAny, Filter
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, '/opt/recon')
|
||||
from lib.utils import get_config, setup_logging
|
||||
|
||||
# Log destination for this script; records go both to this file and to the console.
LOG_FILE = Path("/opt/recon/logs/reenrich_reference.log")

# logging.FileHandler opens its file as soon as it is constructed and fails if
# the parent directory is missing, so create the directory up front.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()],
)
log = logging.getLogger("reenrich_reference")
|
||||
|
||||
# Root of the on-disk concept store; update_concept_json() looks for
# <CONCEPTS_DIR>/<doc_hash>/window_*.json underneath it.
CONCEPTS_DIR = Path("/opt/recon/data/concepts")

# The only domain labels accepted from the classifier. "Reference" is
# deliberately absent, so membership tests against this set filter it out.
CANONICAL_DOMAINS = {
    "Defense & Tactics",
    "Sustainment Systems",
    "Off-Grid Systems",
    "Foundational Skills",
    "Communications",
    "Medical",
    "Food Systems",
    "Navigation",
    "Logistics",
    "Power Systems",
    "Leadership",
    "Scenario Playbooks",
    "Water Systems",
    "Security",
    "Community Coordination",
}
|
||||
|
||||
# Hardened classification prompt used by classify().
# - "Reference" and an empty domain list are explicitly forbidden as answers.
# - Every canonical domain is listed, followed by keyword -> domain rules.
# - Filled in with str.format(): {title}, {subdomain} and {content} are the
#   placeholders; the braces of the example JSON are doubled ({{ }}) so that
#   they survive formatting as literal braces.
CLASSIFY_PROMPT = """\
You are a knowledge classification engine. Classify this concept into its correct domain.

VALID DOMAINS — use ONLY these exact strings:
Defense & Tactics
Sustainment Systems
Off-Grid Systems
Foundational Skills
Communications
Medical
Food Systems
Navigation
Logistics
Power Systems
Leadership
Scenario Playbooks
Water Systems
Security
Community Coordination

FORBIDDEN: Do NOT output "Reference" under any circumstances. It is not a valid domain.
FORBIDDEN: Do NOT output an empty domain list.

CLASSIFICATION RULES:
- First aid, anatomy, pharmacology, herbs, veterinary, austere medicine, wound care → Medical
- Food growing, foraging, hunting, fishing, animal husbandry, livestock → Sustainment Systems
- Food preservation, canning, fermentation, food storage, dehydrating → Food Systems
- Solar, wind, hydro, batteries, generators, inverters, charge controllers → Power Systems
- Water sourcing, filtration, purification, sanitation, wells, rainwater → Water Systems
- Radio, antennas, mesh networking, SIGINT, amateur radio → Communications
- Weapons, tactics, NBC, security operations, field craft → Defense & Tactics
- Permaculture, soil science, agroforestry, composting → Sustainment Systems
- Shelter, construction, masonry, blacksmithing, woodworking, crafts → Foundational Skills
- Navigation, land nav, celestial nav, map reading, compass → Navigation
- Emergency planning, disaster prep, scenario planning → Scenario Playbooks
- Leadership, governance, community organization → Leadership
- Supply chain, transportation, inventory → Logistics
- Physical security, perimeter, surveillance → Security
- Community building, cooperation, mutual aid → Community Coordination
- Biogas, wood gasification, rocket stoves, appropriate technology → Off-Grid Systems

If uncertain between two domains, pick the most actionable one for a self-reliant household.

Concept title: {title}
Concept subdomain tags: {subdomain}
Concept content: {content}

Return ONLY valid JSON, no markdown, no explanation:
{{"domain": ["Domain Name"]}}
"""
|
||||
|
||||
def load_gemini_keys(env_path="/opt/recon/.env"):
    """Return the Gemini API keys declared in a dotenv-style file.

    Every line whose name starts with ``GEMINI_KEY_`` and that carries a
    non-empty value contributes one key, in file order. Surrounding whitespace
    and one pair of matching quotes around the value are removed. Lines that
    have no ``=`` or an empty value are skipped instead of yielding a broken
    key (or raising ``IndexError``).

    Args:
        env_path: Path of the env file to read. Defaults to the deployment's
            ``/opt/recon/.env``.

    Returns:
        List of key strings; empty when the file declares none.

    Raises:
        OSError: If the file cannot be read.
    """
    keys = []
    for raw_line in Path(env_path).read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line.startswith("GEMINI_KEY_"):
            continue
        # partition() never raises: a line without "=" just yields an empty separator.
        _name, sep, value = line.partition("=")
        value = value.strip()
        # Accept KEY="value" / KEY='value' as written by many dotenv tools.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        if sep and value:
            keys.append(value)
    return keys
|
||||
|
||||
class KeyRotator:
    """Hand out API keys in round-robin order, safely across threads."""

    def __init__(self, keys):
        """Store the keys to rotate through.

        Args:
            keys: Iterable of API keys; must contain at least one entry.

        Raises:
            ValueError: If ``keys`` is empty. Failing here gives a clear error
                instead of a ``ZeroDivisionError`` on the first ``next()`` call.
        """
        # Snapshot the keys so later changes to the caller's list cannot
        # alter the rotation while worker threads are using it.
        self.keys = list(keys)
        if not self.keys:
            raise ValueError("KeyRotator requires at least one API key")
        self._i = 0
        self._lock = threading.Lock()

    def next(self):
        """Return the next key, wrapping around after the last one."""
        # The lock makes "read index, pick key, advance index" one atomic step.
        with self._lock:
            key = self.keys[self._i % len(self.keys)]
            self._i += 1
            return key
|
||||
|
||||
def classify(title, subdomains, content, key, attempt=0):
    """Ask Gemini for a concept's domain(s), falling back to keyword matching.

    Args:
        title: Concept title; "(untitled)" is sent when falsy.
        subdomains: Subdomain tags; at most the first 10 are sent, "(none)"
            when falsy.
        content: Concept text; only its first 400 characters are sent.
        key: Gemini API key to use for this call.
        attempt: Unused; kept so existing callers keep working.

    Returns:
        A list of domain names. When Gemini answers validly, the entries are
        the members of its answer that appear in CANONICAL_DOMAINS (so
        "Reference" is dropped). Otherwise the result of subdomain_fallback().
    """
    prompt = CLASSIFY_PROMPT.format(
        title=title or "(untitled)",
        subdomain=", ".join(str(s) for s in subdomains[:10]) if subdomains else "(none)",
        content=str(content)[:400] if content else "(none)",
    )
    # NOTE(review): genai.configure() appears to set a module-wide default key.
    # With several worker threads rotating keys, a request may go out under
    # another thread's key — confirm against the SDK before relying on
    # per-key quotas.
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"},
    )

    max_attempts = 4
    invalid_answers = 0
    for retry in range(max_attempts):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
            raw = data.get("domain", []) if isinstance(data, dict) else []
            # A bare string answer ("Medical") would otherwise be iterated
            # character by character and never match anything.
            if isinstance(raw, str):
                raw = [raw]
            elif not isinstance(raw, (list, tuple)):
                raw = []
            domains = [
                d for d in raw
                if isinstance(d, str) and d in CANONICAL_DOMAINS  # strips Reference automatically
            ]
            if domains:
                return domains
            # Gemini returned Reference, an unknown label or nothing: ask
            # exactly once more, then give up and use the heuristic.
            invalid_answers += 1
            if invalid_answers >= 2:
                break
        except Exception as e:
            err = str(e).lower()
            # NOTE(review): the bare "rate" marker also matches words such as
            # "generate"; kept as-is to preserve which errors are retried.
            retryable = any(s in err for s in ["429", "quota", "rate", "503", "unavailable"])
            if not retryable:
                log.warning("Gemini classification failed for %r: %s", title, e)
                break
            # Back off before the next attempt; sleeping after the last one
            # would only delay the fallback.
            if retry < max_attempts - 1:
                time.sleep(min(5 * (2 ** retry) + random.uniform(0, 3), 60))

    # Last resort: subdomain keyword heuristic
    return subdomain_fallback(subdomains or [])
|
||||
|
||||
# Ordered (keywords, domain) rules for the offline fallback classifier.
# Keywords are lowercase substrings; rules are tried top to bottom and the
# first rule with any matching keyword wins, so order matters.
# NOTE(review): "community" maps to Leadership here and no rule yields
# "Community Coordination" — confirm that is intended.
SUBDOMAIN_FALLBACK_MAP = [
    (["first aid", "trauma", "wound", "anatomy", "pharmacol", "herbal", "medicin", "veterinar", "dental", "surgery"], "Medical"),
    (["foraging", "hunting", "fishing", "livestock", "permaculture", "soil", "agroforestry", "mycolog", "mushroom"], "Sustainment Systems"),
    (["canning", "preservation", "fermentation", "food storage", "dehydrat"], "Food Systems"),
    (["solar", "battery", "generator", "inverter", "wind turbine", "photovoltaic"], "Power Systems"),
    (["water purif", "filtration", "sanitation", "well", "rainwater"], "Water Systems"),
    (["radio", "antenna", "mesh", "sigint", "amateur radio", "meshtastic"], "Communications"),
    (["weapon", "firearm", "tactic", "nbc", "chemical warfare", "ballistic"], "Defense & Tactics"),
    (["navigation", "compass", "land nav", "celestial"], "Navigation"),
    (["blacksmith", "woodwork", "masonry", "construct", "craft", "pottery"], "Foundational Skills"),
    (["biogas", "gasif", "rocket stove", "appropriate tech"], "Off-Grid Systems"),
    (["disaster", "emergency prep", "evacuation", "scenario"], "Scenario Playbooks"),
    (["leadership", "governance", "community"], "Leadership"),
    (["logistics", "supply chain", "transport"], "Logistics"),
    (["security", "perimeter", "surveillance"], "Security"),
]


def subdomain_fallback(subdomains):
    """Guess a domain from subdomain tags by keyword matching.

    Args:
        subdomains: Iterable of tag strings. ``None`` is treated as no tags,
            a bare string as a single tag, and non-string entries are ignored.

    Returns:
        A one-element list: the domain of the first rule in
        SUBDOMAIN_FALLBACK_MAP with a keyword occurring (case-insensitively,
        as a substring) in the tags, or ``["Foundational Skills"]`` when no
        rule matches.
    """
    if subdomains is None:
        subdomains = []
    elif isinstance(subdomains, str):
        # Joining the characters of a bare string with spaces would destroy
        # every keyword, so treat it as one tag.
        subdomains = [subdomains]

    combined = " ".join(s.lower() for s in subdomains if isinstance(s, str))
    for keywords, domain in SUBDOMAIN_FALLBACK_MAP:
        if any(kw in combined for kw in keywords):
            return [domain]
    return ["Foundational Skills"]  # absolute last resort
|
||||
|
||||
# Serialises read-modify-write cycles on concept files. process_point() runs in
# a thread pool, and two points from the same document may target the same
# window file at the same time.
_CONCEPT_FILE_LOCK = threading.Lock()


def _needs_retag(raw):
    """Return True if a stored domain value contains Reference or no canonical domain."""
    if isinstance(raw, str):
        raw = [raw]
    elif not isinstance(raw, (list, tuple)):
        # Missing, null or otherwise malformed: nothing valid is stored.
        raw = []
    if "Reference" in raw:
        return True
    return not any(isinstance(d, str) and d in CANONICAL_DOMAINS for d in raw)


def _retag_window_file(wf, title, new_domains):
    """Rewrite one window file; return True if any concept in it was changed."""
    with open(wf, "r", encoding="utf-8") as f:
        concepts = json.load(f)
    changed = False
    for c in concepts:
        if not isinstance(c, dict):
            continue
        if c.get("title") == title and _needs_retag(c.get("domain", [])):
            c["domain"] = new_domains
            changed = True
    if changed:
        with open(wf, "w", encoding="utf-8") as f:
            json.dump(concepts, f, indent=2, ensure_ascii=False)
    return changed


def update_concept_json(doc_hash, title, new_domains):
    """Update the domain of a concept in its document's JSON files on disk.

    Scans ``CONCEPTS_DIR/<doc_hash>/window_*.json`` for concepts whose title
    equals ``title`` and whose stored domain contains "Reference" or no
    canonical domain, and replaces that domain with ``new_domains``. Stops at
    the first file in which something was changed.

    Args:
        doc_hash: Name of the document's directory under CONCEPTS_DIR.
        title: Exact concept title to match.
        new_domains: Domain list to store.

    Returns:
        True if a file was rewritten, False otherwise (including when the
        directory is missing or every candidate file failed to load).
    """
    if not doc_hash:
        return False
    doc_dir = CONCEPTS_DIR / doc_hash
    if not doc_dir.exists():
        return False
    for wf in doc_dir.glob("window_*.json"):
        try:
            with _CONCEPT_FILE_LOCK:
                if _retag_window_file(wf, title, new_domains):
                    return True
        except Exception as exc:
            # Best effort: one unreadable file must not stop the others, but
            # the failure should be visible in the log.
            log.warning("Could not update concept file %s: %s", wf, exc)
    return False
|
||||
|
||||
def process_point(point, qdrant, collection, key_rotator, dry_run):
    """Re-classify one Qdrant point and persist the new domain.

    Args:
        point: Scrolled Qdrant record; its ``payload`` and ``id`` are read.
        qdrant: Client whose ``set_payload`` is called with the new domain.
        collection: Name of the Qdrant collection holding the point.
        key_rotator: Object whose ``next()`` supplies the Gemini API key.
        dry_run: When true, nothing is classified or written.

    Returns:
        "would_classify" for a dry run, "ok" after a successful update, or
        "error" if classification or persistence raised.
    """
    # Decide this first: a dry run must not consume an API key or make a
    # billable Gemini call.
    if dry_run:
        return "would_classify"

    try:
        payload = point.payload or {}
        title = payload.get("title", "")
        subdomains = payload.get("subdomain") or []
        if isinstance(subdomains, str):
            subdomains = [subdomains]
        # Fall back to the summary when content is absent *or* empty.
        content = payload.get("content") or payload.get("summary", "")
        doc_hash = payload.get("doc_hash", "")

        key = key_rotator.next()
        new_domains = classify(title, subdomains, content, key)

        # Update Qdrant payload
        qdrant.set_payload(
            collection_name=collection,
            payload={"domain": new_domains},
            points=[point.id],
        )

        # Update JSON on disk
        if doc_hash:
            update_concept_json(doc_hash, title, new_domains)
    except Exception as exc:
        # One bad point is reported as a status so that the rest of the batch
        # and the final summary are not lost.
        log.warning("Failed to re-classify point %s: %s", getattr(point, "id", "?"), exc)
        return "error"

    return "ok"
|
||||
|
||||
def main():
    """Re-classify every Qdrant point whose domain contains "Reference".

    Command-line options:
        --dry-run   Only count the matching points and log the estimate.
        --workers   Number of worker threads (default 16).
        --limit     Process at most this many points (default: all).

    Raises:
        SystemExit: If a real run is requested but no Gemini keys are found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--workers", type=int, default=16)
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()

    config = get_config()
    keys = load_gemini_keys()

    qdrant = QdrantClient(
        host=config['vector_db']['host'],
        port=config['vector_db']['port'],
        timeout=60
    )
    collection = config['vector_db']['collection']

    log.info("Scrolling Qdrant for Reference-tagged concepts...")

    # A missing or non-positive --limit means "no limit".
    limit = args.limit if args.limit and args.limit > 0 else None
    reference_filter = Filter(
        must=[FieldCondition(
            key="domain",
            match=MatchAny(any=["Reference"])
        )]
    )

    # Scroll all points containing Reference in domain
    offset = None
    reference_points = []
    while True:
        # Never request more than is still needed, so a small --limit does not
        # pull a full 1000-point page.
        page_size = 1000 if limit is None else min(1000, limit - len(reference_points))
        page, offset = qdrant.scroll(
            collection_name=collection,
            scroll_filter=reference_filter,
            limit=page_size,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        reference_points.extend(page)
        if offset is None:
            break
        if limit is not None and len(reference_points) >= limit:
            break
    # Enforce the limit on every exit path, including when the last (or only)
    # page ended the scroll.
    if limit is not None:
        reference_points = reference_points[:limit]

    total = len(reference_points)
    log.info(f"Found {total:,} Reference-tagged vectors")
    log.info(f"Workers: {args.workers} | Keys: {len(keys)} | Dry run: {args.dry_run}")
    log.info(f"Estimated Gemini Flash cost: ~${total * 0.0004:.2f}")

    if args.dry_run:
        log.info(f"DRY RUN: would re-classify {total:,} concepts. Exiting.")
        return

    # Keys are only needed for a real run; fail with a clear message instead of
    # an arithmetic error deep inside a worker thread.
    if not keys:
        raise SystemExit("No Gemini API keys found (expected GEMINI_KEY_* entries in the .env file)")
    rotator = KeyRotator(keys)

    tally = defaultdict(int)
    done = 0
    start = time.time()

    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futures = {
            ex.submit(process_point, p, qdrant, collection, rotator, False): p
            for p in reference_points
        }
        # Results are tallied on this thread only, so no lock is needed. There
        # is also no sleep here: it would not throttle the workers, only delay
        # collecting their results.
        for future in as_completed(futures):
            try:
                status = future.result()
            except Exception as exc:
                # Keep going: one failed point must not discard the summary.
                log.warning("Worker failed for point %s: %s", futures[future].id, exc)
                status = "error"
            tally[status] += 1
            done += 1
            if done % 5000 == 0:
                elapsed = max(time.time() - start, 1e-9)
                rate = done / elapsed * 60
                eta = (total - done) / (done / elapsed) / 60
                log.info(f" {done:,}/{total:,} | {rate:.0f}/min | ETA {eta:.0f}min | {dict(tally)}")

    elapsed = time.time() - start
    log.info(f"\nComplete in {elapsed/60:.1f}min:")
    for status, count in sorted(tally.items(), key=lambda x: -x[1]):
        log.info(f" {status:<20} {count:>10,}")


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue