#!/usr/bin/env python3
"""
cleanup_outliers.py — Three-pass cleanup of RECON concept data.

Pass 1: Remap ~160 non-canonical domain strings in concept JSONs + Qdrant payloads
Pass 2: Re-enrich 434 concepts with empty domain arrays via Gemini
Pass 3: Purge junk/noise URLs from Qdrant + SQLite DB

Usage:
    python3 /opt/recon/scripts/cleanup_outliers.py [--dry-run] [--skip-pass N]
"""

import json
import time
import random
import logging
import argparse
import threading
import sqlite3
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict

import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import FieldCondition, MatchAny, Filter

import sys
import os
sys.path.insert(0, '/opt/recon')
from lib.utils import get_config, setup_logging

LOG_FILE = Path("/opt/recon/logs/cleanup_outliers.log")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
)
log = logging.getLogger("cleanup_outliers")

CONCEPTS_DIR = Path("/opt/recon/data/concepts")
DB_PATH = Path("/opt/recon/data/recon.db")
ENV_PATH = Path("/opt/recon/.env")

CANONICAL_DOMAINS = {
    "Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
    "Foundational Skills", "Communications", "Medical", "Food Systems",
    "Navigation", "Logistics", "Power Systems", "Leadership",
    "Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
}

# Domain used whenever remapping or classification yields nothing usable.
FALLBACK_DOMAINS = ["Foundational Skills"]

# Non-canonical → canonical remap
OUTLIER_MAP = {
    "Zoology": "Sustainment Systems",
    "Botany": "Sustainment Systems",
    "Nature Lore": "Sustainment Systems",
    "Ecology": "Sustainment Systems",
    "Navigational Astronomy": "Navigation",
    "Troubleshooting": "Foundational Skills",
    "Chemistry": "Foundational Skills",
    "Metallurgy": "Foundational Skills",
    "Weird Science": "Foundational Skills",
    "Philosophy of physics": "Foundational Skills",
    "Physics": "Foundational Skills",
    "Cell biology": "Foundational Skills",
    "Economics": "Leadership",
    "Business": "Leadership",
    "Safety": "Security",
    "Law Enforcement": "Security",
    "Security & Intelligence": "Security",
    "Fire Weather": "Scenario Playbooks",
    "Legal": "Leadership",
    # Discard — replace with closest real domain
    "Site News": "Foundational Skills",
    "Paleogeography": "Foundational Skills",
    "Chemical Manipulation": "Foundational Skills",
}

# Junk URL patterns — pages with no knowledge value
JUNK_URL_PATTERNS = [
    # rocketstoves.com nav/template garbage
    "rocketstoves.com/favicon",
    "rocketstoves.com/cropped-favicon",
    "rocketstoves.com/layouts/",
    "rocketstoves.com/sample",
    "rocketstoves.com/templates/",
    "rocketstoves.com/hello-world",
    "rocketstoves.com/blog-forthcoming",
    "rocketstoves.com/contact",
    "rocketstoves.com/acknowledgements",
    "rocketstoves.com/ja3",
    "rocketstoves.com/juxtapositions",
    "rocketstoves.com/no-name-soi",
    "rocketstoves.com/big4",
    "rocketstoves.com/roof",
    "rocketstoves.com/rmh_dloadcover",
    "rocketstoves.com/pedcover",
    "rocketstoves.com/laundry-to-landscape",
    "rocketstoves.com/barreloven",
    # NRCS calendar/event noise
    "nrcs.usda.gov/events/",
    "nrcs.usda.gov/state-offices/massachusetts",
    "nrcs.usda.gov/state-offices/nebraska",
    "nrcs.usda.gov/state-offices/oklahoma",
    "nrcs.usda.gov/state-offices/utah",
    "nrcs.usda.gov/conservation-basics/natural-resource-concerns/soil/western-call-for-abstracts",
    # deeranddeerhunting trophy hunt videos (no knowledge value)
    "deeranddeerhunting.com/trophy-whitetails-exclusive-videos/",
    # eattheweeds non-content pages
    "eattheweeds.com/media-interviews-with-green-deane",
    "eattheweeds.com/motorcycles-and-mushrooms",
    "eattheweeds.com/sunny-savage",
    # foragersharvest nav pages
    "foragersharvest.com/contact",
    "foragersharvest.com/podcasts",
    # motherearthnews classifieds/nav
    "motherearthnews.com/classifieds/",
    "motherearthnews.com/biographies/",
]

CLASSIFY_PROMPT = """\
Classify this knowledge concept into one or more domains.

VALID DOMAINS (use ONLY these exact strings):
Defense & Tactics, Sustainment Systems, Off-Grid Systems, Foundational Skills,
Communications, Medical, Food Systems, Navigation, Logistics, Power Systems,
Leadership, Scenario Playbooks, Water Systems, Security, Community Coordination

Concept title: {title}
Concept tags: {subdomain}
Concept preview: {content}

Return ONLY valid JSON, no markdown:
{{"domain": ["Domain Name"]}}

Rules:
- Never return empty domain list
- Medical content, herbs, first aid, veterinary → Medical
- Food growing, foraging, hunting, livestock → Sustainment Systems
- Food preservation, canning, storage → Food Systems
- Solar, wind, batteries, generators → Power Systems
- Water sourcing, filtration, sanitation → Water Systems
"""


# ── Shared helpers ───────────────────────────────────────────────────────────

def _as_domain_list(raw):
    """Normalise a stored ``domain`` value to a list.

    ``None`` becomes ``[]``, a bare string becomes a one-element list,
    lists/tuples are copied, and anything else is treated as empty.
    """
    if raw is None:
        return []
    if isinstance(raw, str):
        return [raw]
    if isinstance(raw, (list, tuple)):
        return list(raw)
    return []


def _is_empty_domain(raw):
    """Return True when a ``domain`` value holds no non-blank string.

    Covers missing/``None``, ``[]``, ``""`` and ``[""]`` so that the Qdrant
    scan and the on-disk JSON update in pass 2 agree on what "empty" means.
    """
    return not any(
        isinstance(d, str) and d.strip() for d in _as_domain_list(raw)
    )


def _scroll_all(qdrant, collection, scroll_filter=None, page_size=500):
    """Yield every point of ``collection`` (optionally filtered), page by page.

    Payloads are requested, vectors are not. Iteration stops when Qdrant
    returns ``None`` as the next-page offset.
    """
    offset = None
    while True:
        results, offset = qdrant.scroll(
            collection_name=collection,
            scroll_filter=scroll_filter,
            limit=page_size,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        yield from results
        if offset is None:
            break


def load_gemini_keys(env_path=ENV_PATH):
    """Read all ``GEMINI_KEY_*`` values from an env file.

    Args:
        env_path: Path of the env file; defaults to ``/opt/recon/.env``.

    Returns:
        List of key strings in file order. Lines without ``=`` and entries
        with a blank value are skipped; one pair of surrounding quotes is
        stripped from each value.

    Raises:
        OSError: if the file cannot be read.
    """
    keys = []
    for line in Path(env_path).read_text().splitlines():
        if not line.startswith("GEMINI_KEY_"):
            continue
        _name, sep, value = line.partition("=")
        value = value.strip()
        # Tolerate KEY="value" / KEY='value' as commonly written in .env files.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        if sep and value:
            keys.append(value)
    return keys


class KeyRotator:
    """Thread-safe round-robin iterator over a fixed list of API keys."""

    def __init__(self, keys):
        self.keys = list(keys)
        self._i = 0
        self._lock = threading.Lock()

    def next(self):
        """Return the next key, cycling back to the first after the last.

        Raises:
            ValueError: if the rotator was created with no keys.
        """
        with self._lock:
            if not self.keys:
                raise ValueError("KeyRotator has no API keys to rotate through")
            key = self.keys[self._i % len(self.keys)]
            self._i += 1
            return key


def classify_concept(title, subdomains, content, key):
    """Ask Gemini to assign canonical domains to a single concept.

    Args:
        title: Concept title; ``"(untitled)"`` is sent when falsy.
        subdomains: Tag list (a bare string is treated as one tag); only the
            first 10 are included in the prompt.
        content: Concept text; only the first 300 characters are sent.
        key: Gemini API key to use for this call.

    Returns:
        Non-empty list of domains, each a member of ``CANONICAL_DOMAINS``.
        Falls back to ``["Foundational Skills"]`` when no attempt produces a
        valid domain.
    """
    if isinstance(subdomains, str):
        # A bare string would otherwise be sliced and joined per character.
        subdomains = [subdomains]
    tags = [str(s) for s in list(subdomains or [])[:10]]

    prompt = CLASSIFY_PROMPT.format(
        title=title or "(untitled)",
        subdomain=", ".join(tags) if tags else "(none)",
        content=str(content)[:300] if content else "(none)",
    )
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"}
    )

    max_attempts = 4
    for attempt in range(max_attempts):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
            raw = data.get("domain", []) if isinstance(data, dict) else []
            if isinstance(raw, str):
                raw = [raw]
            # Keep only canonical names, de-duplicated in the model's order.
            domains = list(dict.fromkeys(
                d for d in raw
                if isinstance(d, str) and d in CANONICAL_DOMAINS
            ))
            if domains:
                return domains
        except Exception as e:
            err = str(e).lower()
            if not any(s in err for s in ["429", "quota", "rate", "503"]):
                log.warning(f"Gemini classification failed for {title!r}: {e}")
                break
            # Back off on throttling, but don't wait after the last attempt.
            if attempt < max_attempts - 1:
                time.sleep(min(5 * (2 ** attempt) + random.uniform(0, 3), 60))
    return list(FALLBACK_DOMAINS)


# ── PASS 1: Remap outlier domains ────────────────────────────────────────────

def remap_concept_domains(domains):
    """Remap any outlier domain names in a domain list.

    Canonical names are kept, ``OUTLIER_MAP`` keys are replaced by their
    target, and anything else (unknown names, non-strings) is dropped.
    The result is de-duplicated and keeps first-seen order.

    Blank strings are removed too, but a list made up *only* of blanks is
    reported as unchanged so it stays "empty" for pass 2 to re-classify
    rather than being defaulted here.

    Returns:
        ``(new_domains, changed)`` where ``changed`` tells the caller whether
        the stored value needs rewriting.
    """
    result = []
    seen = set()
    changed = False
    dropped_blank = False

    for d in domains:
        if isinstance(d, str) and not d.strip():
            dropped_blank = True
            continue
        if not isinstance(d, str):
            changed = True  # drop non-string junk
            continue
        if d in CANONICAL_DOMAINS:
            mapped = d
        elif d in OUTLIER_MAP:
            mapped = OUTLIER_MAP[d]
            changed = True
        else:
            changed = True  # drop unknown
            continue
        if mapped not in seen:
            seen.add(mapped)
            result.append(mapped)

    if dropped_blank and result:
        changed = True
    return result, changed


def pass1_remap_outliers(qdrant, collection, dry_run):
    """Rewrite non-canonical domains in Qdrant payloads and concept JSON files.

    Qdrant: only points whose ``domain`` matches a key of ``OUTLIER_MAP`` are
    fetched and updated. Disk: every ``window_*.json`` under ``CONCEPTS_DIR``
    is checked. With ``dry_run`` nothing is written; counts are still logged.

    NOTE(review): unknown domains that are not in ``OUTLIER_MAP`` are dropped
    from the JSON files but never selected by the Qdrant filter, so the two
    stores can differ for those — confirm whether that is intended.

    Returns:
        ``defaultdict(int)`` with ``qdrant_updated`` and ``json_files_updated``.
    """
    log.info("=== PASS 1: Remapping non-canonical outlier domains ===")
    outlier_names = list(OUTLIER_MAP.keys())
    stats = defaultdict(int)

    # Collect every affected point first so payload updates cannot disturb
    # the filtered scroll while it is still paging.
    outlier_filter = Filter(
        must=[FieldCondition(
            key="domain",
            match=MatchAny(any=outlier_names)
        )]
    )
    affected_points = list(_scroll_all(qdrant, collection, outlier_filter))
    log.info(f"Found {len(affected_points)} Qdrant points with outlier domains")

    for point in affected_points:
        payload = point.payload or {}
        old_domains = _as_domain_list(payload.get("domain"))
        new_domains, changed = remap_concept_domains(old_domains)
        if not changed:
            continue
        if not new_domains:
            new_domains = list(FALLBACK_DOMAINS)
        stats["qdrant_updated"] += 1
        if not dry_run:
            qdrant.set_payload(
                collection_name=collection,
                payload={"domain": new_domains},
                points=[point.id],
            )

    # Also fix concept JSON files on disk
    json_fixed = 0
    for window_file in CONCEPTS_DIR.rglob("window_*.json"):
        try:
            with open(window_file, "r", encoding="utf-8") as f:
                concepts = json.load(f)
        except (OSError, ValueError) as exc:
            log.warning(f"Skipping unreadable concept file {window_file}: {exc}")
            continue
        if not isinstance(concepts, list):
            continue

        file_changed = False
        for concept in concepts:
            if not isinstance(concept, dict):
                continue
            raw = _as_domain_list(concept.get("domain"))
            new, changed = remap_concept_domains(raw)
            if changed:
                concept["domain"] = new if new else list(FALLBACK_DOMAINS)
                file_changed = True

        if file_changed:
            json_fixed += 1
            if not dry_run:
                with open(window_file, "w", encoding="utf-8") as f:
                    json.dump(concepts, f, indent=2, ensure_ascii=False)

    stats["json_files_updated"] = json_fixed
    log.info(f"Pass 1 complete: {stats['qdrant_updated']} Qdrant points updated, {json_fixed} JSON files updated")
    return stats


# ── PASS 2: Re-enrich empty domain concepts ──────────────────────────────────

def _update_concept_files(doc_dir, title, new_domains):
    """Set ``new_domains`` on empty-domain concepts titled ``title`` in ``doc_dir``.

    Only ``window_*.json`` files directly inside ``doc_dir`` are considered.
    Read/write failures are logged and skipped; the on-disk update is
    best-effort and never aborts the pass.
    """
    if not doc_dir.exists():
        return
    for wf in doc_dir.glob("window_*.json"):
        try:
            with open(wf, "r", encoding="utf-8") as f:
                concepts = json.load(f)
        except (OSError, ValueError) as exc:
            log.warning(f"Skipping unreadable concept file {wf}: {exc}")
            continue
        if not isinstance(concepts, list):
            continue

        changed = False
        for c in concepts:
            if (isinstance(c, dict) and c.get("title") == title
                    and _is_empty_domain(c.get("domain"))):
                c["domain"] = new_domains
                changed = True

        if changed:
            try:
                with open(wf, "w", encoding="utf-8") as f:
                    json.dump(concepts, f, indent=2, ensure_ascii=False)
            except OSError as exc:
                log.warning(f"Could not rewrite concept file {wf}: {exc}")


def pass2_empty_domains(qdrant, collection, key_rotator, dry_run):
    """Classify points whose ``domain`` is empty and store the result.

    The whole collection is scrolled and filtered client-side. For each empty
    point the Qdrant payload is updated and, when the payload carries a
    ``doc_hash``, matching concepts in ``CONCEPTS_DIR/<doc_hash>`` as well.

    With ``dry_run`` the points are only counted: no Gemini request is made
    (its result would be discarded anyway) and ``key_rotator`` is not used.

    Returns:
        ``defaultdict(int)`` with ``classified`` = number of empty points.
    """
    log.info("=== PASS 2: Re-enriching empty domain concepts ===")
    stats = defaultdict(int)

    # Find empty domain points in Qdrant
    empty_points = [
        p for p in _scroll_all(qdrant, collection)
        if _is_empty_domain((p.payload or {}).get("domain"))
    ]
    log.info(f"Found {len(empty_points)} points with empty domains")
    if dry_run and empty_points:
        log.info("Dry run: skipping Gemini classification and all writes")

    for point in empty_points:
        stats["classified"] += 1
        if dry_run:
            continue

        payload = point.payload or {}
        title = payload.get("title", "")
        subdomains = payload.get("subdomain", [])
        content = payload.get("content", payload.get("summary", ""))

        new_domains = classify_concept(title, subdomains, content, key_rotator.next())

        qdrant.set_payload(
            collection_name=collection,
            payload={"domain": new_domains},
            points=[point.id],
        )

        # Also update the concept JSON on disk
        doc_hash = payload.get("doc_hash", "")
        if doc_hash:
            _update_concept_files(CONCEPTS_DIR / doc_hash, title, new_domains)

        time.sleep(0.05)  # brief pause between consecutive API calls

    log.info(f"Pass 2 complete: {stats['classified']} concepts re-classified")
    return stats


# ── PASS 3: Purge junk URLs ──────────────────────────────────────────────────

def is_junk_url(url):
    """Return True if ``url`` contains any ``JUNK_URL_PATTERNS`` entry.

    Matching is a case-insensitive substring test. Falsy input (``None``,
    ``""``) is never junk.
    """
    if not url:
        return False
    url_lower = str(url).lower()
    return any(pattern.lower() in url_lower for pattern in JUNK_URL_PATTERNS)


def pass3_purge_junk(qdrant, collection, dry_run):
    """Delete junk web vectors from Qdrant and mark their documents skipped.

    Points with ``source_type == "web"`` are scanned and their ``filename``
    payload field is tested with ``is_junk_url``. Unless ``dry_run`` is set,
    matching points are deleted in batches of 500 and each affected
    ``doc_hash`` is set to ``status = 'skipped'`` in the ``documents`` table.

    Returns:
        ``defaultdict(int)`` with ``junk_vectors`` and ``junk_docs``.
    """
    log.info("=== PASS 3: Purging junk URLs ===")
    stats = defaultdict(int)

    # Scroll all web-source points and find junk
    web_filter = Filter(
        must=[FieldCondition(key="source_type", match=MatchAny(any=["web"]))]
    )
    junk_point_ids = []
    junk_doc_hashes = set()
    for r in _scroll_all(qdrant, collection, web_filter):
        payload = r.payload or {}
        if not is_junk_url(payload.get("filename", "")):
            continue
        junk_point_ids.append(r.id)
        doc_hash = payload.get("doc_hash", "")
        if doc_hash:
            junk_doc_hashes.add(doc_hash)

    log.info(f"Found {len(junk_point_ids)} junk vectors across {len(junk_doc_hashes)} documents")

    if not dry_run and junk_point_ids:
        # Delete in batches
        batch_size = 500
        for i in range(0, len(junk_point_ids), batch_size):
            batch = junk_point_ids[i:i + batch_size]
            qdrant.delete(collection_name=collection, points_selector=batch)
        log.info(f"Deleted {len(junk_point_ids)} junk vectors from Qdrant")

        # Mark junk docs as skipped in SQLite; always release the connection,
        # even when an UPDATE fails part-way through.
        conn = sqlite3.connect(str(DB_PATH))
        try:
            conn.executemany(
                "UPDATE documents SET status = 'skipped', error_message = 'junk content purged' WHERE hash = ?",
                [(doc_hash,) for doc_hash in junk_doc_hashes],
            )
            conn.commit()
        finally:
            conn.close()
        log.info(f"Marked {len(junk_doc_hashes)} documents as skipped in DB")

    stats["junk_vectors"] = len(junk_point_ids)
    stats["junk_docs"] = len(junk_doc_hashes)
    log.info(f"Pass 3 complete: {stats['junk_vectors']} vectors, {stats['junk_docs']} docs purged")
    return stats


def main():
    """Parse CLI flags and run the selected cleanup passes in order 1, 2, 3."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true",
                        help="report what would change without writing anything")
    parser.add_argument("--skip-pass", type=int, action="append", default=[],
                        help="pass number to skip (1, 2 or 3); may be repeated")
    args = parser.parse_args()

    config = get_config()

    # Gemini keys are only needed when pass 2 will actually call the API.
    rotator = None
    if 2 not in args.skip_pass and not args.dry_run:
        keys = load_gemini_keys()
        if not keys:
            parser.error(
                f"no GEMINI_KEY_* entries found in {ENV_PATH}; "
                "pass 2 needs at least one (or use --skip-pass 2)"
            )
        rotator = KeyRotator(keys)

    qdrant = QdrantClient(
        host=config['vector_db']['host'],
        port=config['vector_db']['port'],
        timeout=60
    )
    collection = config['vector_db']['collection']

    log.info(f"Starting cleanup | dry_run={args.dry_run} | skipping passes: {args.skip_pass}")

    if 1 not in args.skip_pass:
        pass1_remap_outliers(qdrant, collection, args.dry_run)

    if 2 not in args.skip_pass:
        pass2_empty_domains(qdrant, collection, rotator, args.dry_run)

    if 3 not in args.skip_pass:
        pass3_purge_junk(qdrant, collection, args.dry_run)

    log.info("All passes complete.")


if __name__ == "__main__":
    main()