mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
215 lines
6.9 KiB
Python
215 lines
6.9 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
domain_reenrich.py — Re-enriches solo-Reference concepts that domain_remap.py
|
||
|
|
couldn't fix via subdomain lookup. Reads remap_unknowns.jsonl, calls Gemini
|
||
|
|
with a lightweight classification-only prompt, updates domain in-place.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python3 /opt/recon/scripts/domain_reenrich.py [--workers 16] [--limit N]
|
||
|
|
|
||
|
|
Reads: /opt/recon/data/remap_unknowns.jsonl
|
||
|
|
Writes: domain field in-place in window JSON files
|
||
|
|
Log: /opt/recon/logs/domain_reenrich.log
|
||
|
|
"""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import time
|
||
|
|
import random
|
||
|
|
import logging
|
||
|
|
import argparse
|
||
|
|
import threading
|
||
|
|
from pathlib import Path
|
||
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
|
|
from collections import defaultdict
|
||
|
|
|
||
|
|
import google.generativeai as genai
|
||
|
|
|
||
|
|
# Input list of concepts to re-classify (one JSON object per line) and the
# log file this script appends to.
UNKNOWNS_FILE = Path("/opt/recon/data/remap_unknowns.jsonl")
LOG_FILE = Path("/opt/recon/logs/domain_reenrich.log")

# Build the handler list defensively: opening the log file happens at import
# time, so a missing or unwritable log directory must not prevent the script
# from starting. In that case logging falls back to the stream handler only.
_log_handlers = [logging.StreamHandler()]
_log_file_error = None
try:
    LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    # Keep the original handler order: file first, then stream.
    _log_handlers.insert(0, logging.FileHandler(LOG_FILE))
except OSError as exc:
    _log_file_error = exc

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=_log_handlers,
)
log = logging.getLogger("domain_reenrich")

if _log_file_error is not None:
    log.warning(
        "Cannot open log file %s (%s); logging to stream only",
        LOG_FILE,
        _log_file_error,
    )
|
||
|
|
|
||
|
|
# Closed vocabulary of domain labels. The list order is the order in which
# the labels are presented in the classification prompt.
CANONICAL_DOMAINS = [
    "Defense & Tactics",
    "Sustainment Systems",
    "Off-Grid Systems",
    "Foundational Skills",
    "Communications",
    "Medical",
    "Food Systems",
    "Navigation",
    "Logistics",
    "Power Systems",
    "Leadership",
    "Scenario Playbooks",
    "Water Systems",
    "Security",
    "Community Coordination",
]

# Same labels as a set, for constant-time membership checks when filtering
# the labels returned by the model.
DOMAIN_SET = {*CANONICAL_DOMAINS}
|
||
|
|
|
||
|
|
CLASSIFY_PROMPT = """\
|
||
|
|
Classify this knowledge concept into one or more domains.
|
||
|
|
|
||
|
|
VALID DOMAINS (use ONLY these exact strings, no others):
|
||
|
|
{domains}
|
||
|
|
|
||
|
|
Concept title: {title}
|
||
|
|
Concept tags: {subdomain}
|
||
|
|
Concept preview: {content}
|
||
|
|
|
||
|
|
Return ONLY valid JSON, no markdown, no explanation:
|
||
|
|
{{"domain": ["Domain Name"]}}
|
||
|
|
|
||
|
|
Rules:
|
||
|
|
- Use only the domain strings listed above, spelled exactly
|
||
|
|
- If genuinely multi-domain assign all that apply
|
||
|
|
- Never return empty domain list — pick the closest match
|
||
|
|
- Medical content, herbs, first aid, veterinary → Medical
|
||
|
|
- Food growing, foraging, hunting, livestock → Sustainment Systems
|
||
|
|
- Food preservation, canning, storage → Food Systems
|
||
|
|
- Solar, wind, batteries, generators → Power Systems
|
||
|
|
- Water sourcing, filtration, sanitation → Water Systems
|
||
|
|
"""
|
||
|
|
|
||
|
|
def load_gemini_keys(env_path=None):
    """Collect Gemini API keys from a dotenv-style file.

    Every line whose variable name starts with ``GEMINI_KEY_`` contributes its
    value, in file order. Leading/trailing whitespace, an optional ``export ``
    prefix and one pair of matching surrounding quotes are removed. Lines
    without ``=`` and entries with an empty value are skipped, so the result
    never contains unusable keys.

    Args:
        env_path: File to read (str or Path). Defaults to ``/opt/recon/.env``.

    Returns:
        List of non-empty key strings. Empty when the file does not exist or
        contains no usable ``GEMINI_KEY_*`` entry.
    """
    env = Path(env_path) if env_path is not None else Path("/opt/recon/.env")
    try:
        text = env.read_text(encoding="utf-8", errors="replace")
    except FileNotFoundError:
        # Callers treat an empty list as "no keys configured".
        return []

    keys = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        if not line.startswith("GEMINI_KEY_"):
            continue
        _name, sep, value = line.partition("=")
        if not sep:
            # Variable name with no assignment: nothing to read.
            continue
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        if value:
            keys.append(value)
    return keys
|
||
|
|
|
||
|
|
class KeyRotator:
    """Hand out API keys in round-robin order.

    A single instance may be shared between threads: the cursor is advanced
    under a lock, so consecutive ``next()`` calls cycle through the keys
    without skipping or repeating out of turn.
    """

    def __init__(self, keys):
        """Store a private copy of *keys*.

        Args:
            keys: Iterable of API key strings; must contain at least one.

        Raises:
            ValueError: If *keys* is empty. Failing here gives a clear error
                instead of a ZeroDivisionError on the first ``next()`` call.
        """
        # Copy so later mutation of the caller's list cannot change rotation.
        self.keys = list(keys)
        if not self.keys:
            raise ValueError("KeyRotator requires at least one API key")
        self._i = 0
        self._lock = threading.Lock()

    def next(self):
        """Return the next key in the cycle."""
        with self._lock:
            key = self.keys[self._i % len(self.keys)]
            self._i += 1
            return key
|
||
|
|
|
||
|
|
def classify_concept(title, subdomains, content, key):
    """Ask Gemini which canonical domain(s) a concept belongs to.

    Args:
        title: Concept title; "(untitled)" is sent when it is falsy.
        subdomains: Concept tags. A bare string is treated as a single tag;
            at most the first ten tags are sent.
        content: Preview text; only the first 300 characters are sent.
        key: Gemini API key to use for this call.

    Returns:
        A non-empty list of labels taken from DOMAIN_SET. When no valid
        answer is obtained within four attempts, or a non-retryable error
        occurs, returns ["Foundational Skills"] as a last-resort fallback
        and logs a warning.
    """
    # Normalize the inputs before formatting so odd shapes (a plain string,
    # non-string tags, non-string content) cannot raise outside the retry loop.
    if isinstance(subdomains, str):
        subdomains = [subdomains]
    tags = [str(tag) for tag in list(subdomains or [])[:10]]
    preview = str(content)[:300] if content else "(none)"

    prompt = CLASSIFY_PROMPT.format(
        domains="\n".join(f" {d}" for d in CANONICAL_DOMAINS),
        title=title or "(untitled)",
        subdomain=", ".join(tags) if tags else "(none)",
        content=preview,
    )

    # NOTE(review): genai.configure() appears to set the key process-wide, so
    # worker threads using different keys may overwrite each other's setting.
    # Confirm against the SDK documentation.
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"},
    )

    # Substrings that mark an error as worth retrying. "rate" alone is too
    # broad: it also matches any message that mentions "generate".
    transient_markers = (
        "429", "quota", "503", "unavailable",
        "rate limit", "rate-limit", "rate_limit", "ratelimit",
        "rate exceeded",
    )
    max_attempts = 4
    last_problem = None

    for attempt in range(max_attempts):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
        except Exception as e:  # SDK exception types are not importable here
            last_problem = e
            err = str(e).lower()
            if not any(marker in err for marker in transient_markers):
                break
            # Back off before the next try, but never after the last one.
            if attempt < max_attempts - 1:
                delay = min(5 * (2 ** attempt) + random.uniform(0, 3), 60)
                time.sleep(delay)
            continue

        # Accept {"domain": [...]}, {"domain": "X"} or a bare list, and keep
        # only exact canonical labels (order preserved, duplicates removed).
        raw = data.get("domain", []) if isinstance(data, dict) else data
        if isinstance(raw, str):
            raw = [raw]
        if isinstance(raw, list):
            cleaned = [d.strip() for d in raw if isinstance(d, str)]
            domains = list(dict.fromkeys(d for d in cleaned if d in DOMAIN_SET))
            if domains:
                return domains
        last_problem = "no valid domain in response: " + str(data)[:200]

    # Same logger name as the module-level `log`.
    logging.getLogger("domain_reenrich").warning(
        "Falling back to 'Foundational Skills' for %r: %s", title, last_problem
    )
    return ["Foundational Skills"]  # last-resort fallback
|
||
|
|
|
||
|
|
# One lock per window file. Several queued items can point at the same file
# and are handled by different worker threads, so the read-modify-write of a
# file must be serialized or updates would overwrite each other.
_FILE_LOCKS = {}
_FILE_LOCKS_GUARD = threading.Lock()


def _file_lock(filepath):
    """Return the lock guarding *filepath*, creating it on first use."""
    # Keyed by the path string as given; different spellings of the same
    # file would get different locks.
    with _FILE_LOCKS_GUARD:
        return _FILE_LOCKS.setdefault(str(filepath), threading.Lock())


def _load_concepts(filepath):
    """Read a window file; return (concepts, None) or (None, status)."""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            concepts = json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        return None, "file_missing"
    except (OSError, ValueError):
        # ValueError covers both invalid JSON and undecodable bytes.
        return None, "read_error"
    if not isinstance(concepts, list):
        return None, "not_list"
    return concepts, None


def _find_stale_concept(concepts, title):
    """Return the first concept named *title* whose domain is empty or only "Reference"."""
    for concept in concepts:
        if not isinstance(concept, dict) or concept.get("title", "") != title:
            continue
        raw = concept.get("domain") or []  # a null domain counts as empty
        if isinstance(raw, str):
            raw = [raw]
        if raw == ["Reference"] or raw == []:
            return concept
    return None


def process_unknown(item, key_rotator):
    """Re-classify one concept and store the new domain in its window file.

    Args:
        item: Dict with "filepath" (window JSON file) and optionally
            "title", "subdomain" and "content_preview".
        key_rotator: Object whose ``next()`` returns the API key to use.

    Returns:
        A status string:
        "ok"            - domain updated and file rewritten;
        "already_fixed" - no concept with this title still needs a domain;
        "file_missing", "read_error", "not_list" - the file could not be used;
        "write_error"   - the updated file could not be written;
        "bad_item"      - *item* is not a dict or has no "filepath".
    """
    if not isinstance(item, dict) or not item.get("filepath"):
        return "bad_item"

    filepath = Path(item["filepath"])
    title = item.get("title", "")
    subdomains = item.get("subdomain", [])
    content = item.get("content_preview", "")
    lock = _file_lock(filepath)

    # Phase 1: cheap check so no API call is spent on concepts that are
    # missing or already fixed.
    with lock:
        concepts, status = _load_concepts(filepath)
    if status is not None:
        return status
    if _find_stale_concept(concepts, title) is None:
        return "already_fixed"

    # Phase 2: the slow network call runs outside the lock so other items
    # in the same file are not blocked behind it.
    new_domains = classify_concept(title, subdomains, content, key_rotator.next())

    # Phase 3: re-read under the lock so updates written by other workers
    # since phase 1 are preserved, then apply this one and write back.
    with lock:
        concepts, status = _load_concepts(filepath)
        if status is not None:
            return status
        target = _find_stale_concept(concepts, title)
        if target is None:
            return "already_fixed"
        target["domain"] = new_domains
        target["_reenriched"] = True
        try:
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(concepts, f, indent=2, ensure_ascii=False)
        except (OSError, TypeError, ValueError):
            return "write_error"

    return "ok"
|
||
|
|
|
||
|
|
def main():
    """Re-enrich every concept listed in UNKNOWNS_FILE.

    Command-line options:
        --workers N   number of worker threads (default 16, must be >= 1)
        --limit N     process only the first N entries (default: all)

    Loads the API keys, reads the JSONL work list, runs process_unknown()
    for each entry on a thread pool, and logs progress every 5,000 items
    plus a final per-status summary. Returns without doing work when no
    keys are found or the work list cannot be read.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--workers", type=int, default=16)
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()
    if args.workers < 1:
        parser.error("--workers must be >= 1")

    keys = load_gemini_keys()
    if not keys:
        log.error("No Gemini keys found in .env")
        return
    rotator = KeyRotator(keys)

    unknowns = []
    try:
        with open(UNKNOWNS_FILE, "r", encoding="utf-8") as f:
            for lineno, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue
                try:
                    unknowns.append(json.loads(line))
                except ValueError:
                    # One corrupt line should not discard the whole work list.
                    log.warning("Skipping malformed JSON on line %d of %s", lineno, UNKNOWNS_FILE)
    except OSError as e:
        log.error("Cannot read %s: %s", UNKNOWNS_FILE, e)
        return

    # Compare against None so that "--limit 0" really means zero items.
    if args.limit is not None:
        unknowns = unknowns[:args.limit]

    total = len(unknowns)
    log.info(f"Re-enriching {total:,} concepts | {args.workers} workers | {len(keys)} API keys")
    log.info(f"Estimated Gemini Flash cost: ~${total * 0.0004:.2f} (conservative)")

    # Only this thread touches the counters, so no lock is needed.
    results = defaultdict(int)

    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futures = {ex.submit(process_unknown, item, rotator): item for item in unknowns}
        # No sleep between results: all work is already submitted, so a
        # pause here would only delay collection, not throttle API calls.
        for done, future in enumerate(as_completed(futures), 1):
            try:
                status = future.result()
            except Exception:
                # A failure in one item must not abort the remaining items.
                log.exception("Worker failed for item %r", futures[future])
                status = "exception"
            results[status] += 1
            if done % 5000 == 0:
                pct = done / total * 100
                log.info(f" Progress: {done:,}/{total:,} ({pct:.1f}%) | {dict(results)}")

    log.info("── Final Results ─────────────────────────────────────────────")
    for status, count in sorted(results.items(), key=lambda x: -x[1]):
        log.info(f" {status:<25} {count:>10,}")
    log.info(f" Total: {total:,}")


if __name__ == "__main__":
    main()
|