recon/scripts/domain_remap.py

428 lines
15 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
"""
domain_remap.py — Fix RECON concept domain classifications without API calls.

What this does:
  1. Strips "Reference" from concepts that have other real domains
  2. Remaps variant domain spellings to canonical names
  3. Reclassifies solo-Reference concepts using their subdomain tags
  4. Writes a JSONL file of true unknowns for API re-enrichment

Each window file is a JSON array of concept dicts.
Field names: "domain" (list), "subdomain" (list)

Usage:
    python3 /opt/recon/scripts/domain_remap.py --dry-run    # report only
    python3 /opt/recon/scripts/domain_remap.py              # apply fixes
    python3 /opt/recon/scripts/domain_remap.py --workers 16
"""
import json
import argparse
import threading
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict
# Root directory that main() searches recursively for "window_*.json" files.
CONCEPTS_DIR = Path("/opt/recon/data/concepts")
# JSONL file (one JSON object per line) that main() writes the concepts needing
# API re-enrichment to; only written when not in --dry-run mode.
UNKNOWNS_OUTPUT = Path("/opt/recon/data/remap_unknowns.jsonl")
# The complete set of accepted domain names. remap_domains() keeps a domain
# only if it is listed here or can be translated to one of these names.
CANONICAL_DOMAINS = {
    "Communications",
    "Community Coordination",
    "Defense & Tactics",
    "Food Systems",
    "Foundational Skills",
    "Leadership",
    "Logistics",
    "Medical",
    "Navigation",
    "Off-Grid Systems",
    "Power Systems",
    "Scenario Playbooks",
    "Security",
    "Sustainment Systems",
    "Water Systems",
}
# Translation table: non-canonical domain spelling -> canonical domain name.
# A value of None marks a noise domain that is thrown away rather than mapped.
# Lookups are exact-match (case- and punctuation-sensitive), which is why
# several near-identical spellings are listed individually.
VARIANT_MAP = {
    # ---- maps to "Defense & Tactics" ----
    "Tactical Ops": "Defense & Tactics",
    "Tactical_Ops": "Defense & Tactics",
    "Tactical Operations": "Defense & Tactics",
    "Tactical": "Defense & Tactics",
    "Tactical Skills": "Defense & Tactics",
    "Tactics": "Defense & Tactics",
    "Tactics & Defense": "Defense & Tactics",
    "Reconnaissance": "Defense & Tactics",
    "Fire Support": "Defense & Tactics",
    "Improvised Munitions": "Defense & Tactics",
    "Military Intelligence": "Defense & Tactics",
    "Military History": "Defense & Tactics",
    "Military Engineering": "Defense & Tactics",
    # ---- maps to "Medical" (one exception flagged inline) ----
    "Medical Care": "Medical",
    "Medical Alternatives": "Medical",
    "Medical/Dental": "Medical",
    "Medical & Dental": "Medical",
    "medical": "Medical",
    "Medical Awareness": "Medical",
    "Medical Disasters": "Medical",
    "Medical Emergency Survival": "Medical",
    "Medical Procedures": "Medical",
    "Medical Treatment": "Medical",
    "Medical Science": "Medical",
    "Medical History": "Medical",
    "Medical Diagnosis": "Medical",
    "Medical Skills": "Medical",
    "Medical Supply": "Medical",
    "Medical Gear": "Medical",
    "Medical Kits": "Medical",
    "Medical Logistics": "Logistics",  # filed under Logistics, not Medical
    "Medical First Aid": "Medical",
    "Medical Ethics": "Medical",
    "Medical Reference Ranges": "Medical",
    "Medical andSurgical Hints": "Medical",
    "Medical Aspects of Radiation Injury": "Medical",
    "Medical Uses": "Medical",
    "Medical Care in Developing Countries": "Medical",
    "Survival Medicine": "Medical",
    "Emergency War Surgery": "Medical",
    "First Aid": "Medical",
    "First Aid and Life Saving": "Medical",
    "Veterinary Medicine": "Medical",
    "Veterinary Hygiene": "Medical",
    "Veterinary": "Medical",
    "Pharmacology": "Medical",
    "Public Health": "Medical",
    "Health": "Medical",
    # ---- maps to "Food Systems" ----
    "Food_Systems": "Food Systems",
    "Food_systems": "Food Systems",
    "food_systems": "Food Systems",
    "Food Preservation": "Food Systems",
    "Food Safety": "Food Systems",
    "Food Security": "Food Systems",
    "Food & Nutrition": "Food Systems",
    "Diet & Nutrition": "Food Systems",
    "Culinary Arts": "Food Systems",
    "Foodprocessing": "Food Systems",
    "Food": "Food Systems",
    # ---- maps to "Sustainment Systems" ----
    "Sustainment_Systems": "Sustainment Systems",
    "Agriculture": "Sustainment Systems",
    "Agriculture & Natural Resources": "Sustainment Systems",
    "Agriculture and Natural Resources": "Sustainment Systems",
    "Horticulture": "Sustainment Systems",
    "Gardening": "Sustainment Systems",
    "Hydroponics": "Sustainment Systems",
    "Survival Skills": "Sustainment Systems",
    # ---- maps to "Foundational Skills" ----
    "Foundational_Skills": "Foundational Skills",
    "Primitive Living Skills": "Foundational Skills",
    "Woodcraft": "Foundational Skills",
    "Home Workshop": "Foundational Skills",
    "Science": "Foundational Skills",
    "Engineering": "Foundational Skills",
    "Construction": "Foundational Skills",
    "Industrial Processes": "Foundational Skills",
    "Machine Technology": "Foundational Skills",
    "Training": "Foundational Skills",
    "Education": "Foundational Skills",
    # ---- maps to "Off-Grid Systems" ----
    "Off-Grid_Systems": "Off-Grid Systems",
    "Appropriate Technology": "Off-Grid Systems",
    # ---- maps to "Power Systems" ----
    "Homebrewed Electricity": "Power Systems",
    "Renewable Energy": "Power Systems",
    "Renewable Energy FAQs": "Power Systems",
    "Alternative Fuels": "Power Systems",
    "Power_Systems": "Power Systems",
    # ---- maps to "Water Systems" ----
    "Water_Systems": "Water Systems",
    # ---- maps to "Community Coordination" ----
    "Community_Coordination": "Community Coordination",
    "Community_coordination": "Community Coordination",
    "Community": "Community Coordination",
    # ---- maps to "Leadership" ----
    "Leadership & Planning": "Leadership",
    "Planning": "Leadership",
    "Administration": "Leadership",
    "Governance": "Leadership",
    "Government": "Leadership",
    # ---- maps to "Communications" ----
    "Emergency Communications": "Communications",
    # ---- maps to "Security" ----
    "Security Systems": "Security",
    # ---- maps to "Logistics" ----
    "Transportation": "Logistics",
    # ---- maps to "Scenario Playbooks" ----
    "General Preparedness": "Scenario Playbooks",
    "Emergency Preparedness": "Scenario Playbooks",
    "Emergency Management": "Scenario Playbooks",
    "Wilderness Preparedness": "Scenario Playbooks",
    "Urban Preparedness": "Scenario Playbooks",
    "Winter Preparedness": "Scenario Playbooks",
    # ---- noise domains: None means "discard" ----
    "Humor": None,
    "Recreation": None,
    "Business": None,
    "Finance": None,
    "Economics": None,
    "Economics/Finances": None,
    "Weird Science": None,
}
# Keyword table used to rescue concepts whose only domain is "Reference":
# lowercase keyword -> canonical domain. classify_by_subdomain() looks these
# keywords up inside each (lowercased) subdomain tag of a concept.
SUBDOMAIN_MAP = {
    # -- medical care, disease, pharmacology --
    "first aid": "Medical",
    "emergency care": "Medical",
    "emergency medicine": "Medical",
    "trauma": "Medical",
    "anatomy": "Medical",
    "oral rehydration": "Medical",
    "ors": "Medical",
    "pharmacology": "Medical",
    "toxicology": "Medical",
    "antidote": "Medical",
    # -- chemical / biological / nuclear warfare --
    "nerve agent": "Defense & Tactics",
    "chemical warfare": "Defense & Tactics",
    "biological warfare": "Defense & Tactics",
    "nbc": "Defense & Tactics",
    # -- more medical --
    "infectious disease": "Medical",
    "microbiology": "Medical",
    "virology": "Medical",
    "bacteriology": "Medical",
    "pediatric": "Medical",
    "surgery": "Medical",
    "wound care": "Medical",
    "veterinary": "Medical",
    "dental": "Medical",
    "dentistry": "Medical",
    "herbal": "Medical",
    "medicinal plant": "Medical",
    "medicinal herb": "Medical",
    "herbalism": "Medical",
    # -- food handling --
    "food preservation": "Food Systems",
    "canning": "Food Systems",
    "fermentation": "Food Systems",
    "food storage": "Food Systems",
    "food safety": "Food Systems",
    "cooking": "Food Systems",
    "food processing": "Food Systems",
    # -- growing, raising and gathering --
    "agriculture": "Sustainment Systems",
    "soil": "Sustainment Systems",
    "permaculture": "Sustainment Systems",
    "agroforestry": "Sustainment Systems",
    "livestock": "Sustainment Systems",
    "animal husbandry": "Sustainment Systems",
    "beekeeping": "Sustainment Systems",
    "foraging": "Sustainment Systems",
    "hunting": "Sustainment Systems",
    "fishing": "Sustainment Systems",
    "gardening": "Sustainment Systems",
    "mycology": "Sustainment Systems",
    "mushroom": "Sustainment Systems",
    # -- water --
    "water purification": "Water Systems",
    "water filtration": "Water Systems",
    "water sanitation": "Water Systems",
    "water disinfection": "Water Systems",
    "water storage": "Water Systems",
    "well construction": "Water Systems",
    "rainwater": "Water Systems",
    # -- electrical power --
    "solar": "Power Systems",
    "wind turbine": "Power Systems",
    "battery": "Power Systems",
    "batteries": "Power Systems",
    "generator": "Power Systems",
    "photovoltaic": "Power Systems",
    "charge controller": "Power Systems",
    "inverter": "Power Systems",
    # -- off-grid fuel and mechanical systems --
    "biogas": "Off-Grid Systems",
    "biomass": "Off-Grid Systems",
    "wood gasification": "Off-Grid Systems",
    "rocket stove": "Off-Grid Systems",
    "mechanical system": "Off-Grid Systems",
    "power transmission": "Off-Grid Systems",
    # -- communications --
    "radio": "Communications",
    "ham radio": "Communications",
    "amateur radio": "Communications",
    "antenna": "Communications",
    "meshtastic": "Communications",
    "encryption": "Communications",
    # -- navigation --
    "navigation": "Navigation",
    "celestial navigation": "Navigation",
    "land navigation": "Navigation",
    "map reading": "Navigation",
    "compass": "Navigation",
    # -- crafts and trades --
    "pottery": "Foundational Skills",
    "ceramics": "Foundational Skills",
    "blacksmithing": "Foundational Skills",
    "woodworking": "Foundational Skills",
    "leatherwork": "Foundational Skills",
    "textile": "Foundational Skills",
    "masonry": "Foundational Skills",
    "metalworking": "Foundational Skills",
    "historical technology": "Foundational Skills",
    # -- weapons and tactics --
    "weapons": "Defense & Tactics",
    "firearms": "Defense & Tactics",
    "ballistics": "Defense & Tactics",
    "tactics": "Defense & Tactics",
    # -- security --
    "perimeter": "Security",
    "surveillance": "Security",
    # -- logistics --
    "supply chain": "Logistics",
    "logistics": "Logistics",
    # -- leadership and community --
    "leadership": "Leadership",
    "governance": "Leadership",
    "community": "Community Coordination",
    # -- scenario planning --
    "emergency preparedness": "Scenario Playbooks",
    "disaster": "Scenario Playbooks",
    "evacuation": "Scenario Playbooks",
}
def remap_domains(domains, canonical=None, variants=None):
    """Remap a list of domain strings — variants to canonical, strip Reference.

    Each entry of *domains* is handled as follows:

    * ``"Reference"`` is dropped.
    * A name found in *canonical* is kept as-is.
    * A name found in *variants* is replaced by its mapped canonical name,
      or dropped when the mapped value is ``None`` (noise domain).
    * Anything else (unknown names, non-string entries) is dropped.

    Args:
        domains: Iterable of raw domain values from a concept.
        canonical: Optional container of accepted canonical names; defaults
            to the module-level ``CANONICAL_DOMAINS``.
        variants: Optional mapping of variant spelling -> canonical name (or
            ``None`` to discard); defaults to the module-level ``VARIANT_MAP``.

    Returns:
        A de-duplicated list of canonical names, in order of first
        appearance in *domains*, so repeated runs produce identical output.
    """
    if canonical is None:
        canonical = CANONICAL_DOMAINS
    if variants is None:
        variants = VARIANT_MAP

    # A dict is used as an insertion-ordered set: unlike set(), its iteration
    # order does not depend on string hashing, so the written JSON is stable.
    kept = {}
    for name in domains:
        # Non-strings (None, nested lists, numbers) can never be a domain
        # name, and unhashable ones would raise on the membership tests.
        if not isinstance(name, str) or name == "Reference":
            continue
        if name in canonical:
            kept[name] = None
            continue
        # .get() yields None both for "not a known variant" and for an
        # explicit discard entry; either way the name is dropped.
        target = variants.get(name)
        if target:
            kept[target] = None
    return list(kept)
def _matches_at_word_start(text, keyword):
    """Return True if *keyword* occurs in *text* at the beginning of a word."""
    pos = text.find(keyword)
    while pos != -1:
        # A hit counts only at the start of the text or right after a
        # non-alphanumeric character (space, hyphen, underscore, slash, ...).
        if pos == 0 or not text[pos - 1].isalnum():
            return True
        pos = text.find(keyword, pos + 1)
    return False


def classify_by_subdomain(subdomains, keyword_map=None):
    """Try to infer canonical domain(s) from subdomain keyword matching.

    Every subdomain tag is lowercased and stripped, then checked against each
    keyword of *keyword_map*. A keyword matches only where it begins a word,
    so "ors" no longer fires inside "generators" or "doors", "dental" inside
    "accidental", or "canning" inside "scanning". Suffixes are still allowed,
    so "generator" keeps matching "generators".

    Args:
        subdomains: Iterable of subdomain tags; ``None`` and non-string
            entries are ignored instead of raising.
        keyword_map: Optional mapping of lowercase keyword -> canonical
            domain; defaults to the module-level ``SUBDOMAIN_MAP``.

    Returns:
        A list of the distinct domains matched, in order of first discovery,
        or ``None`` when no keyword matched.
    """
    if keyword_map is None:
        keyword_map = SUBDOMAIN_MAP

    # dict-as-ordered-set keeps the result order reproducible across runs.
    found = {}
    for tag in subdomains or ():
        if not isinstance(tag, str):
            continue
        tag_lower = tag.lower().strip()
        for keyword, domain in keyword_map.items():
            if _matches_at_word_start(tag_lower, keyword):
                found[domain] = None
    return list(found) if found else None
def _string_items(value):
    """Normalize a concept's "domain"/"subdomain" value to a list of strings."""
    if isinstance(value, str):
        return [value]
    if isinstance(value, list):
        # Drop nulls / nested structures: they cannot be domain names and
        # would break set() and str methods further down.
        return [item for item in value if isinstance(item, str)]
    return []


def _write_json_atomically(filepath, payload):
    """Write *payload* as JSON to *filepath* via a sibling temp file + rename."""
    target = Path(filepath)
    tmp_path = target.with_name(target.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        # The rename only happens after the full document is on disk, so an
        # interrupted run never leaves a truncated window file behind.
        tmp_path.replace(target)
    finally:
        # Still present only if something failed before the rename.
        if tmp_path.exists():
            tmp_path.unlink()


def process_window_file(filepath, dry_run):
    """Process one window JSON file (array of concepts). Returns per-file stats.

    For each concept dict in the file:

    * If remapping its domains yields at least one canonical domain, that
      list replaces "domain" when the concept carried "Reference"
      (``reference_stripped``) or when the set of names changed
      (``variant_remapped``); otherwise it is counted as ``no_change``.
    * If the concept had domains but none survived remapping (solo
      "Reference", only noise, or only unknown names), the domain is inferred
      from its subdomain tags (``subdomain_reclassified``). If that fails too,
      the concept is left untouched and queued for API re-enrichment
      (``needs_enrichment``) rather than having its domain blanked.
    * Concepts with no domain at all are counted as ``no_change``.

    Args:
        filepath: Path of the window file to read (and rewrite if changed).
        dry_run: When true, nothing is written back to disk.

    Returns:
        ``(stats, unknowns)`` where *stats* maps counter name -> count and
        *unknowns* is a list of dicts describing concepts needing
        re-enrichment. An unreadable or non-JSON file yields
        ``({"parse_error": 1}, [])``; a JSON document that is not an array
        yields ``({"skip_not_list": 1}, [])``.

    Raises:
        OSError: If writing the updated file fails.
    """
    stats = defaultdict(int)
    unknowns = []
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            concepts = json.load(f)
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return {"parse_error": 1}, []
    if not isinstance(concepts, list):
        return {"skip_not_list": 1}, []

    modified = False
    for concept in concepts:
        if not isinstance(concept, dict):
            continue
        raw_domains = _string_items(concept.get("domain"))
        subdomains = _string_items(concept.get("subdomain"))
        has_reference = "Reference" in raw_domains

        # remap_domains() drops "Reference" itself, so a single call serves
        # both the with-Reference and without-Reference cases.
        remapped = remap_domains(raw_domains)
        if remapped:
            if has_reference:
                # Reference + real domains: drop Reference, keep the rest.
                concept["domain"] = remapped
                modified = True
                stats["reference_stripped"] += 1
            elif set(remapped) != set(raw_domains):
                concept["domain"] = remapped
                modified = True
                stats["variant_remapped"] += 1
            else:
                stats["no_change"] += 1
            continue

        if not raw_domains:
            # Nothing to remap in the first place.
            stats["no_change"] += 1
            continue

        # Every domain was Reference, noise or unknown: try subdomain lookup
        # instead of overwriting the concept's domain with an empty list.
        inferred = classify_by_subdomain(subdomains)
        if inferred:
            concept["domain"] = inferred
            if has_reference:
                concept["_reclassified_from_reference"] = True
            modified = True
            stats["subdomain_reclassified"] += 1
            continue

        # True unknown — needs API re-enrichment; the concept is not modified.
        unknowns.append({
            "filepath": str(filepath),
            "title": concept.get("title", ""),
            "subdomain": subdomains,
            "content_preview": str(concept.get("content", concept.get("summary", "")))[:300],
        })
        stats["needs_enrichment"] += 1

    if modified and not dry_run:
        _write_json_atomically(filepath, concepts)
    return dict(stats), unknowns
def main():
    """Command-line entry point: remap every window file and print a summary.

    Finds all ``window_*.json`` files under ``CONCEPTS_DIR`` (recursively),
    runs ``process_window_file`` on each in a thread pool, sums the per-file
    counters, prints them, and — unless ``--dry-run`` is given — writes the
    concepts that still need re-enrichment to ``UNKNOWNS_OUTPUT`` as JSONL.

    A failure while processing one file is reported and counted as
    ``worker_error``; it does not abort the rest of the run.
    """
    parser = argparse.ArgumentParser(description="Remap RECON concept domains")
    parser.add_argument("--dry-run", action="store_true", help="Report without writing")
    parser.add_argument("--workers", type=int, default=16)
    args = parser.parse_args()

    print(f"[REMAP] Scanning {CONCEPTS_DIR}")
    print(f"[REMAP] Dry run: {args.dry_run} | Workers: {args.workers}")
    window_files = sorted(CONCEPTS_DIR.rglob("window_*.json"))
    print(f"[REMAP] Found {len(window_files):,} window files")

    total_stats = defaultdict(int)
    all_unknowns = []
    # Results are merged only here in the main thread (workers just return
    # values), so no lock is needed around the aggregation.
    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futures = {ex.submit(process_window_file, f, args.dry_run): f for f in window_files}
        for done, future in enumerate(as_completed(futures), start=1):
            try:
                file_stats, unknowns = future.result()
            except Exception as exc:
                # Top-level boundary: report the failing file and carry on so
                # one bad file cannot discard the results of all the others.
                print(f"[REMAP] ERROR processing {futures[future]}: {exc!r}")
                file_stats, unknowns = {"worker_error": 1}, []
            for k, v in file_stats.items():
                total_stats[k] += v
            all_unknowns.extend(unknowns)
            if done % 5000 == 0:
                print(f" {done:,}/{len(window_files):,} files processed...")

    # Completion order varies between runs; sorting by file (stable, so the
    # within-file order is kept) makes the JSONL output reproducible.
    all_unknowns.sort(key=lambda item: item["filepath"])

    print("\n── Results ─────────────────────────────────────────────────")
    for status, count in sorted(total_stats.items(), key=lambda x: -x[1]):
        print(f" {status:<35} {count:>10,}")
    # These counters count files, not concepts, so keep them out of the total.
    file_level = {"parse_error", "skip_not_list", "worker_error"}
    total_concepts = sum(v for k, v in total_stats.items() if k not in file_level)
    print(f"\n Total concepts processed: {total_concepts:>10,}")
    print(f" True unknowns for re-enrichment:{len(all_unknowns):>10,}")

    if not args.dry_run and all_unknowns:
        with open(UNKNOWNS_OUTPUT, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(item) + "\n" for item in all_unknowns)
        print(f"\n Unknowns written to: {UNKNOWNS_OUTPUT}")
    if args.dry_run:
        print("\n [DRY RUN] No files were modified.")


if __name__ == "__main__":
    main()