mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 14:44:54 +02:00
Initial commit: RECON codebase baseline
Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
563c16bb71
59 changed files with 18327 additions and 0 deletions
215
scripts/domain_reenrich.py
Executable file
215
scripts/domain_reenrich.py
Executable file
|
|
@ -0,0 +1,215 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
domain_reenrich.py — Re-enriches solo-Reference concepts that domain_remap.py
|
||||
couldn't fix via subdomain lookup. Reads remap_unknowns.jsonl, calls Gemini
|
||||
with a lightweight classification-only prompt, updates domain in-place.
|
||||
|
||||
Usage:
|
||||
python3 /opt/recon/scripts/domain_reenrich.py [--workers 16] [--limit N]
|
||||
|
||||
Reads: /opt/recon/data/remap_unknowns.jsonl
|
||||
Writes: domain field in-place in window JSON files
|
||||
Log: /opt/recon/logs/domain_reenrich.log
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
import argparse
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from collections import defaultdict
|
||||
|
||||
import google.generativeai as genai
|
||||
|
||||
# JSONL work queue: one JSON object per line describing a concept to re-classify.
UNKNOWNS_FILE = Path("/opt/recon/data/remap_unknowns.jsonl")
# Run log; every record is also echoed to the console by the StreamHandler below.
LOG_FILE = Path("/opt/recon/logs/domain_reenrich.log")

# FileHandler opens its file immediately and raises FileNotFoundError when the
# directory does not exist, so make sure it is there before configuring logging.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[
        # Explicit UTF-8: this script logs non-ASCII characters, which cannot
        # be encoded when the locale default is ASCII or Latin-1.
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("domain_reenrich")
|
||||
|
||||
# The closed vocabulary of domain labels. The order here is the order in which
# the labels are listed wherever this list is rendered.
CANONICAL_DOMAINS = [
    "Defense & Tactics",
    "Sustainment Systems",
    "Off-Grid Systems",
    "Foundational Skills",
    "Communications",
    "Medical",
    "Food Systems",
    "Navigation",
    "Logistics",
    "Power Systems",
    "Leadership",
    "Scenario Playbooks",
    "Water Systems",
    "Security",
    "Community Coordination",
]

# Same labels as a set, for constant-time membership checks.
DOMAIN_SET = {*CANONICAL_DOMAINS}
|
||||
|
||||
CLASSIFY_PROMPT = """\
|
||||
Classify this knowledge concept into one or more domains.
|
||||
|
||||
VALID DOMAINS (use ONLY these exact strings, no others):
|
||||
{domains}
|
||||
|
||||
Concept title: {title}
|
||||
Concept tags: {subdomain}
|
||||
Concept preview: {content}
|
||||
|
||||
Return ONLY valid JSON, no markdown, no explanation:
|
||||
{{"domain": ["Domain Name"]}}
|
||||
|
||||
Rules:
|
||||
- Use only the domain strings listed above, spelled exactly
|
||||
- If genuinely multi-domain assign all that apply
|
||||
- Never return empty domain list — pick the closest match
|
||||
- Medical content, herbs, first aid, veterinary → Medical
|
||||
- Food growing, foraging, hunting, livestock → Sustainment Systems
|
||||
- Food preservation, canning, storage → Food Systems
|
||||
- Solar, wind, batteries, generators → Power Systems
|
||||
- Water sourcing, filtration, sanitation → Water Systems
|
||||
"""
|
||||
|
||||
def load_gemini_keys(env_path=None):
    """Collect Gemini API keys from a dotenv-style file.

    Every line that starts with ``GEMINI_KEY_`` and has a non-empty value
    after the first ``=`` contributes one key, in file order.

    Args:
        env_path: Path of the env file. Defaults to ``/opt/recon/.env``.

    Returns:
        List of key strings with surrounding whitespace removed. Empty when
        the file does not exist or contains no usable key lines.
    """
    env = Path("/opt/recon/.env") if env_path is None else Path(env_path)
    try:
        lines = env.read_text(encoding="utf-8").splitlines()
    except FileNotFoundError:
        # Let the caller report "no keys" instead of dying with a traceback.
        return []

    keys = []
    for line in lines:
        if not line.startswith("GEMINI_KEY_"):
            continue
        # partition() never raises, unlike split("=", 1)[1] on a line that
        # has no "=" at all.
        _name, sep, value = line.partition("=")
        value = value.strip()
        # An empty value would be handed out as an API key and fail later.
        if sep and value:
            keys.append(value)
    return keys
|
||||
|
||||
class KeyRotator:
    """Hands out API keys from a fixed list in round-robin order.

    A lock guards the position counter so that next() can be called from
    several threads at once.
    """

    def __init__(self, keys):
        self._lock = threading.Lock()
        self._i = 0
        self.keys = keys

    def next(self):
        """Return the key at the current position and advance by one, wrapping around."""
        with self._lock:
            position = self._i % len(self.keys)
            self._i += 1
            return self.keys[position]
|
||||
|
||||
def classify_concept(title, subdomains, content, key):
    """Ask Gemini to assign one or more canonical domains to a concept.

    Args:
        title: Concept title; "(untitled)" is sent when it is empty.
        subdomains: Tag values for the concept. A bare string is treated as
            a single tag. Only the first 10 are sent.
        content: Preview text; only the first 300 characters are sent.
        key: Gemini API key to use for this request.

    Returns:
        A non-empty list of strings, each a member of DOMAIN_SET. When no
        valid answer is obtained (non-retryable error, unparseable reply, or
        all attempts used up) the list is ["Foundational Skills"].
    """
    # A bare string would otherwise be sliced and joined character by character.
    if isinstance(subdomains, str):
        subdomains = [subdomains]

    prompt = CLASSIFY_PROMPT.format(
        domains="\n".join(f" {d}" for d in CANONICAL_DOMAINS),
        title=title or "(untitled)",
        subdomain=", ".join(str(s) for s in subdomains[:10]) if subdomains else "(none)",
        content=content[:300] if content else "(none)",
    )
    # NOTE(review): genai.configure() appears to set a process-wide default
    # key. With several worker threads, another thread may reconfigure it
    # between this call and the request below, so the key actually used may
    # not be `key` -- confirm against the SDK and bind the key per client if so.
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"},
    )

    # Substrings that mark an error as worth retrying. A bare "rate" is
    # deliberately absent: it also matches "generate" (e.g. "generateContent"
    # in SDK error text) and would turn permanent errors into long back-offs.
    transient_markers = (
        "429", "quota", "rate limit", "rate_limit", "ratelimit",
        "resource exhausted", "resource_exhausted", "too many requests",
        "503", "unavailable",
    )
    max_attempts = 4

    for attempt in range(max_attempts):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
        except Exception as e:  # SDK boundary: many unrelated exception types
            err = str(e).lower()
            if not any(marker in err for marker in transient_markers):
                log.warning("Classification failed for %r: %s", title, e)
                break
            # Exponential back-off with jitter, capped at 60 s. Sleeping
            # after the final attempt would only delay the fallback.
            if attempt < max_attempts - 1:
                delay = min(5 * (2 ** attempt) + random.uniform(0, 3), 60)
                time.sleep(delay)
            continue

        raw = data.get("domain", []) if isinstance(data, dict) else []
        # Accept {"domain": "Medical"} as well as the requested list form.
        if isinstance(raw, str):
            raw = [raw]
        elif not isinstance(raw, list):
            raw = []
        domains = [d for d in raw if isinstance(d, str) and d in DOMAIN_SET]
        if domains:
            return domains
        # Reply parsed but named no valid domain: ask again.

    log.warning("No valid domain obtained for %r; using fallback", title)
    return ["Foundational Skills"]  # last-resort fallback
|
||||
|
||||
# One lock per window file. Several queue items can point at the same file and
# process_unknown runs on a thread pool, so unsynchronised read-modify-write
# cycles would overwrite each other's updates.
_FILE_LOCKS = {}
_FILE_LOCKS_GUARD = threading.Lock()


def _file_lock(filepath):
    """Return the lock that serialises access to *filepath*, creating it on first use."""
    with _FILE_LOCKS_GUARD:
        return _FILE_LOCKS.setdefault(str(filepath), threading.Lock())


def _load_concepts(filepath):
    """Load a window file; return (concepts, None), or (None, status) on failure."""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            concepts = json.load(f)
    except (OSError, ValueError):
        # ValueError covers both invalid JSON and undecodable bytes.
        return None, "read_error"
    if not isinstance(concepts, list):
        return None, "not_list"
    return concepts, None


def _find_stuck_concept(concepts, title):
    """Return the first concept titled *title* whose domain is still Reference or empty."""
    for concept in concepts:
        if not isinstance(concept, dict) or concept.get("title", "") != title:
            continue
        raw = concept.get("domain", [])
        if isinstance(raw, str):
            raw = [raw]
        if raw == ["Reference"] or raw == []:
            return concept
    return None


def process_unknown(item, key_rotator):
    """Re-classify one concept and store the new domain in its window file.

    Args:
        item: Dict with a required "filepath" and optional "title",
            "subdomain" and "content_preview" entries.
        key_rotator: Object whose next() returns the API key to use.

    Returns:
        One of the status strings "ok", "file_missing", "read_error",
        "not_list", "already_fixed" (no concept with this title is still on
        Reference or an empty domain) or "write_error".

    Raises:
        KeyError: if *item* has no "filepath" entry.
    """
    filepath = Path(item["filepath"])
    title = item.get("title", "")
    subdomains = item.get("subdomain", [])
    content = item.get("content_preview", "")

    if not filepath.exists():
        return "file_missing"

    lock = _file_lock(filepath)

    # Phase 1: find out whether an API call is needed at all. The lock is
    # released before the slow request so items sharing a file still overlap.
    with lock:
        concepts, status = _load_concepts(filepath)
    if status is not None:
        return status
    if _find_stuck_concept(concepts, title) is None:
        return "already_fixed"

    new_domains = classify_concept(title, subdomains, content, key_rotator.next())

    # Phase 2: re-read under the lock so that updates written by other workers
    # while the request was in flight are kept rather than overwritten.
    with lock:
        concepts, status = _load_concepts(filepath)
        if status is not None:
            return status
        target = _find_stuck_concept(concepts, title)
        if target is None:
            return "already_fixed"
        target["domain"] = new_domains
        target["_reenriched"] = True

        # Write a sibling temp file and rename it over the original, so a
        # failure part-way through cannot leave a truncated window file.
        tmp_path = filepath.with_name(filepath.name + ".tmp")
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(concepts, f, indent=2, ensure_ascii=False)
            tmp_path.replace(filepath)
        except (OSError, TypeError, ValueError):
            try:
                tmp_path.unlink()
            except OSError:
                pass
            return "write_error"

    return "ok"
|
||||
|
||||
def main():
    """Command-line entry point: re-classify every concept in UNKNOWNS_FILE.

    Options:
        --workers  Size of the thread pool (default 16).
        --limit    Process only the first N queue entries.

    Loads the API keys and the JSONL queue, runs process_unknown for each
    entry on a thread pool, and logs progress and a final per-status tally.
    Returns early, after logging an error, when there are no API keys or the
    queue file cannot be read.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--workers", type=int, default=16)
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()

    keys = load_gemini_keys()
    if not keys:
        log.error("No Gemini keys found in .env")
        return
    rotator = KeyRotator(keys)

    unknowns = []
    bad_lines = 0
    try:
        with open(UNKNOWNS_FILE, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    unknowns.append(json.loads(line))
                except json.JSONDecodeError:
                    # One corrupt line should not discard the rest of the queue.
                    bad_lines += 1
    except OSError as e:
        log.error(f"Cannot read {UNKNOWNS_FILE}: {e}")
        return
    if bad_lines:
        log.warning(f"Skipped {bad_lines:,} malformed line(s) in {UNKNOWNS_FILE}")

    if args.limit:
        unknowns = unknowns[:args.limit]

    total = len(unknowns)
    log.info(f"Re-enriching {total:,} concepts | {args.workers} workers | {len(keys)} API keys")
    log.info(f"Estimated Gemini Flash cost: ~${total * 0.0004:.2f} (conservative)")

    # Tallies are touched only by this thread (the as_completed loop), so they
    # need no lock.
    results = defaultdict(int)
    done = 0

    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futures = [ex.submit(process_unknown, item, rotator) for item in unknowns]
        for future in as_completed(futures):
            try:
                status = future.result()
            except Exception:
                # Record the failure and keep going; a single bad queue entry
                # must not abort the remaining work.
                log.exception("Worker raised an unexpected exception")
                status = "worker_error"
            results[status] += 1
            done += 1
            if done % 5000 == 0:
                pct = done / total * 100
                log.info(f" Progress: {done:,}/{total:,} ({pct:.1f}%) | {dict(results)}")

    log.info("── Final Results ─────────────────────────────────────────────")
    for status, count in sorted(results.items(), key=lambda x: -x[1]):
        log.info(f" {status:<25} {count:>10,}")
    log.info(f" Total: {total:,}")
|
||||
|
||||
# Run the re-enrichment only when this file is executed as a script, so that
# importing the module does not start a run.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue