# recon/scripts/cleanup_outliers.py

#!/usr/bin/env python3
"""
cleanup_outliers.py — Three-pass cleanup of RECON concept data.

Pass 1: Remap ~160 non-canonical domain strings in concept JSONs + Qdrant payloads
Pass 2: Re-enrich 434 concepts with empty domain arrays via Gemini
Pass 3: Purge junk/noise URLs from Qdrant + SQLite DB

Usage:
  python3 /opt/recon/scripts/cleanup_outliers.py [--dry-run] [--skip-pass N]
"""

import json
import time
import random
import logging
import argparse
import threading
import sqlite3
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict

import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import FieldCondition, MatchAny, Filter

import sys, os
sys.path.insert(0, '/opt/recon')
from lib.utils import get_config, setup_logging

LOG_FILE = Path("/opt/recon/logs/cleanup_outliers.log")
# FileHandler opens LOG_FILE as soon as it is constructed, so make sure the
# logs directory exists instead of failing at import time when it is missing.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
# Every record is written both to the log file and to the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()],
)
# Logger shared by all three cleanup passes.
log = logging.getLogger("cleanup_outliers")

# Root of the on-disk concept JSON tree. Pass 1 searches it recursively for
# window_*.json files; pass 2 looks inside CONCEPTS_DIR/<doc_hash>/.
CONCEPTS_DIR = Path("/opt/recon/data/concepts")
# SQLite database whose `documents` table is updated by pass 3.
DB_PATH = Path("/opt/recon/data/recon.db")

# The only domain labels allowed to survive the cleanup. The same fifteen
# names are spelled out inside CLASSIFY_PROMPT, so keep the two in sync.
CANONICAL_DOMAINS = {
    "Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
    "Foundational Skills", "Communications", "Medical", "Food Systems",
    "Navigation", "Logistics", "Power Systems", "Leadership",
    "Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
}

# Non-canonical → canonical remap.
# Consulted by remap_concept_domains(); the keys also form the Qdrant filter
# used by pass 1 to find affected points. Labels that are neither canonical
# nor listed here are dropped by remap_concept_domains().
OUTLIER_MAP = {
    "Zoology":                  "Sustainment Systems",
    "Botany":                   "Sustainment Systems",
    "Nature Lore":              "Sustainment Systems",
    "Ecology":                  "Sustainment Systems",
    "Navigational Astronomy":   "Navigation",
    "Troubleshooting":          "Foundational Skills",
    "Chemistry":                "Foundational Skills",
    "Metallurgy":               "Foundational Skills",
    "Weird Science":            "Foundational Skills",
    "Philosophy of physics":    "Foundational Skills",
    "Physics":                  "Foundational Skills",
    "Cell biology":             "Foundational Skills",
    "Economics":                "Leadership",
    "Business":                 "Leadership",
    "Safety":                   "Security",
    "Law Enforcement":          "Security",
    "Security & Intelligence":  "Security",
    "Fire Weather":             "Scenario Playbooks",
    "Legal":                    "Leadership",
    # Discard — these have no real counterpart, so they are folded into the
    # closest real domain instead of being given a dedicated mapping.
    "Site News":                "Foundational Skills",
    "Paleogeography":           "Foundational Skills",
    "Chemical Manipulation":    "Foundational Skills",
}

# Junk URL patterns — pages with no knowledge value.
# is_junk_url() treats each entry as a case-insensitive substring; pass 3
# tests it against the `filename` payload field of web-sourced points.
JUNK_URL_PATTERNS = [
    # rocketstoves.com nav/template garbage
    "rocketstoves.com/favicon",
    "rocketstoves.com/cropped-favicon",
    "rocketstoves.com/layouts/",
    "rocketstoves.com/sample",
    "rocketstoves.com/templates/",
    "rocketstoves.com/hello-world",
    "rocketstoves.com/blog-forthcoming",
    "rocketstoves.com/contact",
    "rocketstoves.com/acknowledgements",
    "rocketstoves.com/ja3",
    "rocketstoves.com/juxtapositions",
    "rocketstoves.com/no-name-soi",
    "rocketstoves.com/big4",
    "rocketstoves.com/roof",
    "rocketstoves.com/rmh_dloadcover",
    "rocketstoves.com/pedcover",
    "rocketstoves.com/laundry-to-landscape",
    "rocketstoves.com/barreloven",
    # NRCS calendar/event noise
    "nrcs.usda.gov/events/",
    "nrcs.usda.gov/state-offices/massachusetts",
    "nrcs.usda.gov/state-offices/nebraska",
    "nrcs.usda.gov/state-offices/oklahoma",
    "nrcs.usda.gov/state-offices/utah",
    "nrcs.usda.gov/conservation-basics/natural-resource-concerns/soil/western-call-for-abstracts",
    # deeranddeerhunting trophy hunt videos (no knowledge value)
    "deeranddeerhunting.com/trophy-whitetails-exclusive-videos/",
    # eattheweeds non-content pages
    "eattheweeds.com/media-interviews-with-green-deane",
    "eattheweeds.com/motorcycles-and-mushrooms",
    "eattheweeds.com/sunny-savage",
    # foragersharvest nav pages
    "foragersharvest.com/contact",
    "foragersharvest.com/podcasts",
    # motherearthnews classifieds/nav
    "motherearthnews.com/classifieds/",
    "motherearthnews.com/biographies/",
]

# Prompt template sent to Gemini by classify_concept(). It is filled in with
# str.format(), so {title}, {subdomain} and {content} are placeholders and the
# doubled braces around the JSON example are escapes for literal { and }.
# The domain list below must match CANONICAL_DOMAINS exactly, because
# classify_concept() discards any returned label that is not in that set.
CLASSIFY_PROMPT = """\
Classify this knowledge concept into one or more domains.

VALID DOMAINS (use ONLY these exact strings):
  Defense & Tactics, Sustainment Systems, Off-Grid Systems, Foundational Skills,
  Communications, Medical, Food Systems, Navigation, Logistics, Power Systems,
  Leadership, Scenario Playbooks, Water Systems, Security, Community Coordination

Concept title: {title}
Concept tags: {subdomain}
Concept preview: {content}

Return ONLY valid JSON, no markdown:
{{"domain": ["Domain Name"]}}

Rules:
- Never return empty domain list
- Medical content, herbs, first aid, veterinary → Medical
- Food growing, foraging, hunting, livestock → Sustainment Systems
- Food preservation, canning, storage → Food Systems
- Solar, wind, batteries, generators → Power Systems
- Water sourcing, filtration, sanitation → Water Systems
"""

def load_gemini_keys(env_path="/opt/recon/.env"):
    """Collect Gemini API keys from the ``GEMINI_KEY_*`` entries of a dotenv file.

    Args:
        env_path: Path of the dotenv-style file to read. Defaults to the
            RECON deployment file ``/opt/recon/.env``.

    Returns:
        The non-empty key values, in the order they appear in the file.
        Lines without ``=`` and entries with an empty value are skipped, and
        one pair of matching surrounding quotes is removed from each value.

    Raises:
        OSError: If the file cannot be read (for example, it does not exist).
    """
    keys = []
    for raw_line in Path(env_path).read_text().splitlines():
        line = raw_line.strip()
        if not line.startswith("GEMINI_KEY_"):
            continue
        # partition() never raises, unlike split("=", 1)[1] on a line that
        # has no "=" at all.
        _name, sep, value = line.partition("=")
        value = value.strip()
        # dotenv files commonly quote values; the quotes are not part of the key.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        if sep and value:
            keys.append(value)
    return keys

class KeyRotator:
    """Thread-safe round-robin dispenser of API keys."""

    def __init__(self, keys):
        """Start a rotation over ``keys`` (the list is referenced, not copied)."""
        self.keys = keys
        self._i = 0  # number of keys handed out so far
        self._lock = threading.Lock()  # serialises reads/updates of _i

    def next(self):
        """Return the next key in rotation, wrapping around after the last one.

        Raises:
            ValueError: If the rotator holds no keys (previously this surfaced
                as an opaque ZeroDivisionError from the modulo below).
        """
        with self._lock:
            if not self.keys:
                raise ValueError("KeyRotator has no API keys to rotate through")
            key = self.keys[self._i % len(self.keys)]
            self._i += 1
            return key

def classify_concept(title, subdomains, content, key):
    """Ask Gemini to assign one or more canonical domains to a concept.

    Args:
        title: Concept title; "(untitled)" is sent when it is falsy.
        subdomains: Tag list (or a single tag string); at most the first ten
            tags are included in the prompt.
        content: Concept text; only its first 300 characters are sent.
        key: Gemini API key to use for this call.

    Returns:
        A non-empty list of labels taken from CANONICAL_DOMAINS. Falls back to
        ["Foundational Skills"] when no usable answer is obtained after up to
        four attempts or when a non-retryable error occurs.
    """
    import re  # local import: only needed for the rate-limit check below

    # Payload fields are not guaranteed to be lists of strings. A bare string
    # must not be sliced and joined character by character.
    if not subdomains:
        tags = []
    elif isinstance(subdomains, (list, tuple)):
        tags = [str(tag) for tag in subdomains[:10]]
    else:
        tags = [str(subdomains)]

    prompt = CLASSIFY_PROMPT.format(
        title=title or "(untitled)",
        subdomain=", ".join(tags) if tags else "(none)",
        content=str(content)[:300] if content else "(none)",
    )
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"}
    )

    max_attempts = 4
    for attempt in range(max_attempts):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
        except Exception as e:
            err = str(e).lower()
            # "rate" must be a standalone word: a plain substring test also
            # matches "generateContent", which made permanent errors look
            # like rate limiting and triggered the full backoff sequence.
            retryable = (
                any(s in err for s in ("429", "quota", "503"))
                or re.search(r"(?<![a-z])rate(?![a-z])", err) is not None
            )
            if not retryable:
                logging.getLogger("cleanup_outliers").warning(
                    "Gemini classification failed for %r, using fallback domain: %s",
                    title, e,
                )
                break
            # Sleeping after the final attempt would only delay the fallback.
            if attempt < max_attempts - 1:
                time.sleep(min(5 * (2 ** attempt) + random.uniform(0, 3), 60))
            continue

        raw = data.get("domain", []) if isinstance(data, dict) else []
        if isinstance(raw, str):
            # The model occasionally answers with a single string, not a list.
            raw = [raw]
        elif not isinstance(raw, list):
            raw = []
        domains = [d for d in raw if isinstance(d, str) and d in CANONICAL_DOMAINS]
        if domains:
            return domains
        # No valid label in an otherwise successful reply: ask again.

    return ["Foundational Skills"]

# ── PASS 1: Remap outlier domains ────────────────────────────────────────────

def remap_concept_domains(domains):
    """Normalise a concept's domain labels to canonical names.

    Canonical labels are kept, labels listed in OUTLIER_MAP are replaced by
    their mapped canonical label, and every other entry is dropped.

    Args:
        domains: List of labels, a single label string, or None (treated as
            an empty list).

    Returns:
        A ``(result, changed)`` tuple. ``result`` is the de-duplicated list of
        canonical labels in first-seen order (so repeated runs produce the
        same output); ``changed`` is True when at least one input entry was
        remapped or dropped.
    """
    if domains is None:
        domains = []
    elif isinstance(domains, str):
        domains = [domains]

    # A dict is used as an insertion-ordered set.
    result = {}
    changed = False
    for d in domains:
        # Only strings can be valid labels; anything else (including
        # unhashable values) is treated as unknown instead of raising.
        if isinstance(d, str) and d in CANONICAL_DOMAINS:
            result[d] = None
        elif isinstance(d, str) and d in OUTLIER_MAP:
            result[OUTLIER_MAP[d]] = None
            changed = True
        else:
            changed = True  # drop unknown
    return list(result), changed

def _remap_qdrant_points(qdrant, collection, dry_run):
    """Remap outlier domains in Qdrant payloads; return the number of points changed."""
    outlier_filter = Filter(
        must=[FieldCondition(key="domain", match=MatchAny(any=list(OUTLIER_MAP)))]
    )

    # Phase 1: collect every point whose domain contains an OUTLIER_MAP key.
    affected_points = []
    offset = None
    while True:
        results, offset = qdrant.scroll(
            collection_name=collection,
            scroll_filter=outlier_filter,
            limit=500,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        affected_points.extend(results)
        if offset is None:
            break

    log.info(f"Found {len(affected_points)} Qdrant points with outlier domains")

    # Phase 2: rewrite the domain payload of each collected point.
    updated = 0
    for point in affected_points:
        old_domains = (point.payload or {}).get("domain")
        if old_domains is None:
            old_domains = []
        elif isinstance(old_domains, str):
            old_domains = [old_domains]

        new_domains, changed = remap_concept_domains(old_domains)
        if not changed:
            continue
        if not new_domains:
            new_domains = ["Foundational Skills"]

        updated += 1
        if not dry_run:
            qdrant.set_payload(
                collection_name=collection,
                payload={"domain": new_domains},
                points=[point.id],
            )
    return updated


def _remap_concept_files(dry_run):
    """Remap outlier domains in window_*.json files; return the number of files changed."""
    files_changed = 0
    for window_file in CONCEPTS_DIR.rglob("window_*.json"):
        try:
            with open(window_file, "r", encoding="utf-8") as f:
                concepts = json.load(f)
        except (OSError, ValueError) as exc:
            # Unreadable or malformed files are skipped, but no longer silently.
            log.warning(f"Skipping unreadable concept file {window_file}: {exc}")
            continue

        if not isinstance(concepts, list):
            continue

        file_changed = False
        for concept in concepts:
            if not isinstance(concept, dict):
                continue
            raw = concept.get("domain", [])
            if raw is None:
                # "domain": null used to crash the whole pass with a TypeError.
                raw = []
            elif isinstance(raw, str):
                raw = [raw]
            elif not isinstance(raw, (list, tuple)):
                continue
            new, changed = remap_concept_domains(raw)
            if changed:
                concept["domain"] = new if new else ["Foundational Skills"]
                file_changed = True

        if file_changed:
            files_changed += 1
            if not dry_run:
                with open(window_file, "w", encoding="utf-8") as f:
                    json.dump(concepts, f, indent=2, ensure_ascii=False)
    return files_changed


def pass1_remap_outliers(qdrant, collection, dry_run):
    """Pass 1: replace non-canonical domain labels in Qdrant and on disk.

    Qdrant points are selected with a filter on the OUTLIER_MAP keys, so only
    points carrying a mapped outlier label are visited there. Every
    window_*.json under CONCEPTS_DIR is scanned, and in those files labels
    that are neither canonical nor mapped are dropped as well. A concept left
    with no domain at all gets ["Foundational Skills"].

    Args:
        qdrant: Connected QdrantClient.
        collection: Name of the Qdrant collection to clean.
        dry_run: When True, count what would change but write nothing.

    Returns:
        A defaultdict(int) with ``qdrant_updated`` (points changed) and
        ``json_updated`` (files changed).
    """
    log.info("=== PASS 1: Remapping non-canonical outlier domains ===")
    stats = defaultdict(int)

    stats["qdrant_updated"] = _remap_qdrant_points(qdrant, collection, dry_run)
    json_fixed = _remap_concept_files(dry_run)
    stats["json_updated"] = json_fixed

    log.info(f"Pass 1 complete: {stats['qdrant_updated']} Qdrant points updated, {json_fixed} JSON files updated")
    return stats

# ── PASS 2: Re-enrich empty domain concepts ──────────────────────────────────

def _domain_is_empty(value):
    """Return True for a missing, empty, or [""] domain value."""
    return not value or value == [""]


def _fill_concept_file_domains(doc_hash, title, new_domains):
    """Write ``new_domains`` into on-disk concepts of one document that match ``title``.

    Only concepts whose domain is currently empty are touched. File errors
    are logged and skipped so one bad file cannot abort the pass.
    """
    if not doc_hash or not isinstance(doc_hash, str):
        return
    doc_concepts_dir = CONCEPTS_DIR / doc_hash
    if not doc_concepts_dir.exists():
        return

    for wf in doc_concepts_dir.glob("window_*.json"):
        try:
            with open(wf, "r", encoding="utf-8") as f:
                concepts = json.load(f)
        except (OSError, ValueError) as exc:
            log.warning(f"Skipping unreadable concept file {wf}: {exc}")
            continue
        if not isinstance(concepts, list):
            continue

        changed = False
        for c in concepts:
            if (
                isinstance(c, dict)
                and c.get("title") == title
                and _domain_is_empty(c.get("domain", []))
            ):
                c["domain"] = new_domains
                changed = True

        if changed:
            try:
                with open(wf, "w", encoding="utf-8") as f:
                    json.dump(concepts, f, indent=2, ensure_ascii=False)
            except OSError as exc:
                log.warning(f"Could not update concept file {wf}: {exc}")


def pass2_empty_domains(qdrant, collection, key_rotator, dry_run):
    """Pass 2: classify concepts whose domain is empty and store the result.

    The whole collection is scrolled and every point whose ``domain`` payload
    is missing, empty, or [""] is sent to classify_concept(). The resulting
    domains are written to the Qdrant payload and to matching concepts (same
    title, empty domain) in CONCEPTS_DIR/<doc_hash>/window_*.json.

    Args:
        qdrant: Connected QdrantClient.
        collection: Name of the Qdrant collection to clean.
        key_rotator: Object whose ``next()`` returns a Gemini API key.
        dry_run: When True, only count the affected points. No Gemini calls
            are made, because their results could not be stored anyway.

    Returns:
        A defaultdict(int) with ``classified``: the number of points that
        were (or, in a dry run, would be) re-classified.
    """
    log.info("=== PASS 2: Re-enriching empty domain concepts ===")
    stats = defaultdict(int)

    # Find empty domain points in Qdrant
    offset = None
    empty_points = []
    while True:
        results, offset = qdrant.scroll(
            collection_name=collection,
            limit=500,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        empty_points.extend(
            r for r in results if _domain_is_empty((r.payload or {}).get("domain", []))
        )
        if offset is None:
            break

    log.info(f"Found {len(empty_points)} points with empty domains")

    for point in empty_points:
        stats["classified"] += 1
        if dry_run:
            # Nothing would be persisted, so do not spend API quota.
            continue

        payload = point.payload or {}
        title = payload.get("title", "")
        subdomains = payload.get("subdomain", [])
        # Fall back to the summary when content is absent *or* empty.
        content = payload.get("content") or payload.get("summary") or ""

        key = key_rotator.next()
        new_domains = classify_concept(title, subdomains, content, key)

        qdrant.set_payload(
            collection_name=collection,
            payload={"domain": new_domains},
            points=[point.id],
        )

        # Also update the concept JSON on disk
        _fill_concept_file_domains(payload.get("doc_hash", ""), title, new_domains)

        time.sleep(0.05)  # light throttle between Gemini requests

    log.info(f"Pass 2 complete: {stats['classified']} concepts re-classified")
    return stats

# ── PASS 3: Purge junk URLs ──────────────────────────────────────────────────

def is_junk_url(url):
    """Return True if ``url`` contains any JUNK_URL_PATTERNS entry.

    Matching is a case-insensitive substring test. A missing, empty, or
    non-string ``url`` is never junk (previously ``None`` raised
    AttributeError).
    """
    if not isinstance(url, str) or not url:
        return False
    url_lower = url.lower()
    return any(pattern.lower() in url_lower for pattern in JUNK_URL_PATTERNS)

def pass3_purge_junk(qdrant, collection, dry_run):
    """Pass 3: delete junk web pages from Qdrant and mark them skipped in SQLite.

    Points with ``source_type`` "web" whose ``filename`` payload matches
    is_junk_url() are deleted from the collection in batches of 500. Every
    ``doc_hash`` seen on such a point is then marked ``status = 'skipped'``
    in the ``documents`` table of DB_PATH.

    Args:
        qdrant: Connected QdrantClient.
        collection: Name of the Qdrant collection to clean.
        dry_run: When True, only count and log; nothing is deleted or updated.

    Returns:
        A defaultdict(int) with ``junk_vectors`` and ``junk_docs`` counts.
    """
    log.info("=== PASS 3: Purging junk URLs ===")
    stats = defaultdict(int)

    # Scroll all web-source points and find junk
    offset = None
    junk_point_ids = []
    junk_doc_hashes = set()

    while True:
        results, offset = qdrant.scroll(
            collection_name=collection,
            scroll_filter=Filter(
                must=[FieldCondition(key="source_type", match=MatchAny(any=["web"]))]
            ),
            limit=500,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        for r in results:
            payload = r.payload or {}
            # A null or non-string filename must not abort the scan.
            filename = payload.get("filename") or ""
            doc_hash = payload.get("doc_hash") or ""
            if isinstance(filename, str) and is_junk_url(filename):
                junk_point_ids.append(r.id)
                if doc_hash:
                    junk_doc_hashes.add(doc_hash)
        if offset is None:
            break

    log.info(f"Found {len(junk_point_ids)} junk vectors across {len(junk_doc_hashes)} documents")

    if not dry_run and junk_point_ids:
        # Delete in batches
        batch_size = 500
        for i in range(0, len(junk_point_ids), batch_size):
            batch = junk_point_ids[i:i + batch_size]
            qdrant.delete(collection_name=collection, points_selector=batch)
        log.info(f"Deleted {len(junk_point_ids)} junk vectors from Qdrant")

        # Mark junk docs as skipped in SQLite
        conn = sqlite3.connect(str(DB_PATH))
        try:
            # `with conn` commits on success and rolls back on error; the
            # finally clause guarantees the connection is closed either way.
            with conn:
                conn.executemany(
                    "UPDATE documents SET status = 'skipped', error_message = 'junk content purged' WHERE hash = ?",
                    [(doc_hash,) for doc_hash in junk_doc_hashes],
                )
        finally:
            conn.close()
        log.info(f"Marked {len(junk_doc_hashes)} documents as skipped in DB")

    stats["junk_vectors"] = len(junk_point_ids)
    stats["junk_docs"] = len(junk_doc_hashes)
    # Do not claim a purge that a dry run never performed.
    outcome = "docs would be purged (dry run)" if dry_run else "docs purged"
    log.info(f"Pass 3 complete: {stats['junk_vectors']} vectors, {stats['junk_docs']} {outcome}")
    return stats


def main():
    """Command-line entry point: run the cleanup passes that were not skipped.

    Options:
        --dry-run       Report what would change without writing anything.
        --skip-pass N   Skip pass N (1, 2 or 3); may be given several times.

    Raises:
        SystemExit: On invalid arguments, or when pass 2 is requested but no
            Gemini API keys are configured.
    """
    parser = argparse.ArgumentParser(
        description="Three-pass cleanup of RECON concept data."
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="report what would change without writing anything",
    )
    # Restricting the choices turns a typo such as "--skip-pass 23" into an
    # error; before, it was ignored and the destructive pass still ran.
    parser.add_argument(
        "--skip-pass", type=int, action="append", default=[],
        choices=[1, 2, 3], metavar="N",
        help="skip pass N (1, 2 or 3); may be repeated",
    )
    args = parser.parse_args()

    config = get_config()

    # Gemini keys are only needed by pass 2, so do not require the .env file
    # when that pass is skipped, and fail early when it would have no keys.
    rotator = None
    if 2 not in args.skip_pass:
        keys = load_gemini_keys()
        if not keys:
            raise SystemExit(
                "No GEMINI_KEY_* entries found in /opt/recon/.env; "
                "add at least one key or run with --skip-pass 2"
            )
        rotator = KeyRotator(keys)

    qdrant = QdrantClient(
        host=config['vector_db']['host'],
        port=config['vector_db']['port'],
        timeout=60
    )
    collection = config['vector_db']['collection']

    log.info(f"Starting cleanup | dry_run={args.dry_run} | skipping passes: {args.skip_pass}")

    if 1 not in args.skip_pass:
        pass1_remap_outliers(qdrant, collection, args.dry_run)

    if 2 not in args.skip_pass:
        pass2_empty_domains(qdrant, collection, rotator, args.dry_run)

    if 3 not in args.skip_pass:
        pass3_purge_junk(qdrant, collection, args.dry_run)

    log.info("All passes complete.")


if __name__ == "__main__":
    main()
Initial commit: RECON codebase baseline Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-04-14 14:57:23 +00:00			`#!/usr/bin/env python3`
			`"""`
			`cleanup_outliers.py — Three-pass cleanup of RECON concept data.`

			`Pass 1: Remap ~160 non-canonical domain strings in concept JSONs + Qdrant payloads`
			`Pass 2: Re-enrich 434 concepts with empty domain arrays via Gemini`
			`Pass 3: Purge junk/noise URLs from Qdrant + SQLite DB`

			`Usage:`
			`python3 /opt/recon/scripts/cleanup_outliers.py [--dry-run] [--skip-pass N]`
			`"""`

			`import json`
			`import time`
			`import random`
			`import logging`
			`import argparse`
			`import threading`
			`import sqlite3`
			`from pathlib import Path`
			`from concurrent.futures import ThreadPoolExecutor, as_completed`
			`from collections import defaultdict`

			`import google.generativeai as genai`
			`from qdrant_client import QdrantClient`
			`from qdrant_client.models import FieldCondition, MatchAny, Filter`

			`import sys, os`
			`sys.path.insert(0, '/opt/recon')`
			`from lib.utils import get_config, setup_logging`

			`LOG_FILE = Path("/opt/recon/logs/cleanup_outliers.log")`
			`logging.basicConfig(`
			`level=logging.INFO,`
			`format="%(asctime)s %(levelname)s %(message)s",`
			`handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]`
			`)`
			`log = logging.getLogger("cleanup_outliers")`

			`CONCEPTS_DIR = Path("/opt/recon/data/concepts")`
			`DB_PATH = Path("/opt/recon/data/recon.db")`

			`CANONICAL_DOMAINS = {`
			`"Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",`
			`"Foundational Skills", "Communications", "Medical", "Food Systems",`
			`"Navigation", "Logistics", "Power Systems", "Leadership",`
			`"Scenario Playbooks", "Water Systems", "Security", "Community Coordination"`
			`}`

			`# Non-canonical → canonical remap`
			`OUTLIER_MAP = {`
			`"Zoology": "Sustainment Systems",`
			`"Botany": "Sustainment Systems",`
			`"Nature Lore": "Sustainment Systems",`
			`"Ecology": "Sustainment Systems",`
			`"Navigational Astronomy": "Navigation",`
			`"Troubleshooting": "Foundational Skills",`
			`"Chemistry": "Foundational Skills",`
			`"Metallurgy": "Foundational Skills",`
			`"Weird Science": "Foundational Skills",`
			`"Philosophy of physics": "Foundational Skills",`
			`"Physics": "Foundational Skills",`
			`"Cell biology": "Foundational Skills",`
			`"Economics": "Leadership",`
			`"Business": "Leadership",`
			`"Safety": "Security",`
			`"Law Enforcement": "Security",`
			`"Security & Intelligence": "Security",`
			`"Fire Weather": "Scenario Playbooks",`
			`"Legal": "Leadership",`
			`# Discard — replace with closest real domain`
			`"Site News": "Foundational Skills",`
			`"Paleogeography": "Foundational Skills",`
			`"Chemical Manipulation": "Foundational Skills",`
			`}`

			`# Junk URL patterns — pages with no knowledge value`
			`JUNK_URL_PATTERNS = [`
			`# rocketstoves.com nav/template garbage`
			`"rocketstoves.com/favicon",`
			`"rocketstoves.com/cropped-favicon",`
			`"rocketstoves.com/layouts/",`
			`"rocketstoves.com/sample",`
			`"rocketstoves.com/templates/",`
			`"rocketstoves.com/hello-world",`
			`"rocketstoves.com/blog-forthcoming",`
			`"rocketstoves.com/contact",`
			`"rocketstoves.com/acknowledgements",`
			`"rocketstoves.com/ja3",`
			`"rocketstoves.com/juxtapositions",`
			`"rocketstoves.com/no-name-soi",`
			`"rocketstoves.com/big4",`
			`"rocketstoves.com/roof",`
			`"rocketstoves.com/rmh_dloadcover",`
			`"rocketstoves.com/pedcover",`
			`"rocketstoves.com/laundry-to-landscape",`
			`"rocketstoves.com/barreloven",`
			`# NRCS calendar/event noise`
			`"nrcs.usda.gov/events/",`
			`"nrcs.usda.gov/state-offices/massachusetts",`
			`"nrcs.usda.gov/state-offices/nebraska",`
			`"nrcs.usda.gov/state-offices/oklahoma",`
			`"nrcs.usda.gov/state-offices/utah",`
			`"nrcs.usda.gov/conservation-basics/natural-resource-concerns/soil/western-call-for-abstracts",`
			`# deeranddeerhunting trophy hunt videos (no knowledge value)`
			`"deeranddeerhunting.com/trophy-whitetails-exclusive-videos/",`
			`# eattheweeds non-content pages`
			`"eattheweeds.com/media-interviews-with-green-deane",`
			`"eattheweeds.com/motorcycles-and-mushrooms",`
			`"eattheweeds.com/sunny-savage",`
			`# foragersharvest nav pages`
			`"foragersharvest.com/contact",`
			`"foragersharvest.com/podcasts",`
			`# motherearthnews classifieds/nav`
			`"motherearthnews.com/classifieds/",`
			`"motherearthnews.com/biographies/",`
			`]`

			`CLASSIFY_PROMPT = """\`
			`Classify this knowledge concept into one or more domains.`

			`VALID DOMAINS (use ONLY these exact strings):`
			`Defense & Tactics, Sustainment Systems, Off-Grid Systems, Foundational Skills,`
			`Communications, Medical, Food Systems, Navigation, Logistics, Power Systems,`
			`Leadership, Scenario Playbooks, Water Systems, Security, Community Coordination`

			`Concept title: {title}`
			`Concept tags: {subdomain}`
			`Concept preview: {content}`

			`Return ONLY valid JSON, no markdown:`
			`{{"domain": ["Domain Name"]}}`

			`Rules:`
			`- Never return empty domain list`
			`- Medical content, herbs, first aid, veterinary → Medical`
			`- Food growing, foraging, hunting, livestock → Sustainment Systems`
			`- Food preservation, canning, storage → Food Systems`
			`- Solar, wind, batteries, generators → Power Systems`
			`- Water sourcing, filtration, sanitation → Water Systems`
			`"""`

			`def load_gemini_keys():`
			`keys = []`
			`for line in Path("/opt/recon/.env").read_text().splitlines():`
			`if line.startswith("GEMINI_KEY_"):`
			`keys.append(line.split("=", 1)[1].strip())`
			`return keys`

			`class KeyRotator:`
			`def __init__(self, keys):`
			`self.keys = keys`
			`self._i = 0`
			`self._lock = threading.Lock()`
			`def next(self):`
			`with self._lock:`
			`key = self.keys[self._i % len(self.keys)]`
			`self._i += 1`
			`return key`

			`def classify_concept(title, subdomains, content, key):`
			`prompt = CLASSIFY_PROMPT.format(`
			`title=title or "(untitled)",`
			`subdomain=", ".join(subdomains[:10]) if subdomains else "(none)",`
			`content=str(content)[:300] if content else "(none)",`
			`)`
			`genai.configure(api_key=key)`
			`model = genai.GenerativeModel(`
			`"gemini-2.0-flash",`
			`generation_config={"response_mime_type": "application/json"}`
			`)`
			`for attempt in range(4):`
			`try:`
			`resp = model.generate_content(prompt)`
			`data = json.loads(resp.text)`
			`domains = [d for d in data.get("domain", []) if d in CANONICAL_DOMAINS]`
			`if domains:`
			`return domains`
			`except Exception as e:`
			`err = str(e).lower()`
			`if any(s in err for s in ["429", "quota", "rate", "503"]):`
			`time.sleep(min(5 * (2 ** attempt) + random.uniform(0, 3), 60))`
			`else:`
			`break`
			`return ["Foundational Skills"]`

			`# ── PASS 1: Remap outlier domains ────────────────────────────────────────────`

			`def remap_concept_domains(domains):`
			`"""Remap any outlier domain names in a domain list."""`
			`result = set()`
			`changed = False`
			`for d in domains:`
			`if d in CANONICAL_DOMAINS:`
			`result.add(d)`
			`elif d in OUTLIER_MAP:`
			`result.add(OUTLIER_MAP[d])`
			`changed = True`
			`else:`
			`changed = True # drop unknown`
			`return list(result), changed`

			`def pass1_remap_outliers(qdrant, collection, dry_run):`
			`log.info("=== PASS 1: Remapping non-canonical outlier domains ===")`
			`outlier_names = list(OUTLIER_MAP.keys())`
			`stats = defaultdict(int)`

			`# Scroll through Qdrant finding affected vectors`
			`offset = None`
			`affected_points = []`

			`while True:`
			`results, offset = qdrant.scroll(`
			`collection_name=collection,`
			`scroll_filter=Filter(`
			`must=[FieldCondition(`
			`key="domain",`
			`match=MatchAny(any=outlier_names)`
			`)]`
			`),`
			`limit=500,`
			`with_payload=True,`
			`with_vectors=False,`
			`offset=offset,`
			`)`
			`affected_points.extend(results)`
			`if offset is None:`
			`break`

			`log.info(f"Found {len(affected_points)} Qdrant points with outlier domains")`

			`for point in affected_points:`
			`payload = point.payload`
			`old_domains = payload.get("domain", [])`
			`if isinstance(old_domains, str):`
			`old_domains = [old_domains]`

			`new_domains, changed = remap_concept_domains(old_domains)`
			`if not new_domains:`
			`new_domains = ["Foundational Skills"]`

			`if changed:`
			`stats["qdrant_updated"] += 1`
			`if not dry_run:`
			`qdrant.set_payload(`
			`collection_name=collection,`
			`payload={"domain": new_domains},`
			`points=[point.id],`
			`)`

			`# Also fix concept JSON files on disk`
			`json_fixed = 0`
			`for window_file in CONCEPTS_DIR.rglob("window_*.json"):`
			`try:`
			`with open(window_file, "r", encoding="utf-8") as f:`
			`concepts = json.load(f)`
			`except Exception:`
			`continue`

			`if not isinstance(concepts, list):`
			`continue`

			`file_changed = False`
			`for concept in concepts:`
			`if not isinstance(concept, dict):`
			`continue`
			`raw = concept.get("domain", [])`
			`if isinstance(raw, str):`
			`raw = [raw]`
			`new, changed = remap_concept_domains(raw)`
			`if changed:`
			`concept["domain"] = new if new else ["Foundational Skills"]`
			`file_changed = True`

			`if file_changed:`
			`json_fixed += 1`
			`if not dry_run:`
			`with open(window_file, "w", encoding="utf-8") as f:`
			`json.dump(concepts, f, indent=2, ensure_ascii=False)`

			`log.info(f"Pass 1 complete: {stats['qdrant_updated']} Qdrant points updated, {json_fixed} JSON files updated")`
			`return stats`

			`# ── PASS 2: Re-enrich empty domain concepts ──────────────────────────────────`

			`def pass2_empty_domains(qdrant, collection, key_rotator, dry_run):`
			`log.info("=== PASS 2: Re-enriching empty domain concepts ===")`
			`stats = defaultdict(int)`

			`# Find empty domain points in Qdrant`
			`offset = None`
			`empty_points = []`
			`while True:`
			`results, offset = qdrant.scroll(`
			`collection_name=collection,`
			`limit=500,`
			`with_payload=True,`
			`with_vectors=False,`
			`offset=offset,`
			`)`
			`for r in results:`
			`d = r.payload.get("domain", [])`
			`if not d or d == [] or d == [""]:`
			`empty_points.append(r)`
			`if offset is None:`
			`break`

			`log.info(f"Found {len(empty_points)} points with empty domains")`

			`for point in empty_points:`
			`payload = point.payload`
			`title = payload.get("title", "")`
			`subdomains = payload.get("subdomain", [])`
			`content = payload.get("content", payload.get("summary", ""))`

			`key = key_rotator.next()`
			`new_domains = classify_concept(title, subdomains, content, key)`
			`stats["classified"] += 1`

			`if not dry_run:`
			`qdrant.set_payload(`
			`collection_name=collection,`
			`payload={"domain": new_domains},`
			`points=[point.id],`
			`)`

			`# Also update the concept JSON on disk`
			`doc_hash = payload.get("doc_hash", "")`
			`if doc_hash:`
			`doc_concepts_dir = CONCEPTS_DIR / doc_hash`
			`if doc_concepts_dir.exists():`
			`for wf in doc_concepts_dir.glob("window_*.json"):`
			`try:`
			`with open(wf, "r", encoding="utf-8") as f:`
			`concepts = json.load(f)`
			`changed = False`
			`for c in concepts:`
			`if isinstance(c, dict) and c.get("title") == title:`
			`d = c.get("domain", [])`
			`if not d or d == []:`
			`c["domain"] = new_domains`
			`changed = True`
			`if changed and not dry_run:`
			`with open(wf, "w", encoding="utf-8") as f:`
			`json.dump(concepts, f, indent=2, ensure_ascii=False)`
			`except Exception:`
			`pass`

			`time.sleep(0.05)`

			`log.info(f"Pass 2 complete: {stats['classified']} concepts re-classified")`
			`return stats`

			`# ── PASS 3: Purge junk URLs ──────────────────────────────────────────────────`

			`def is_junk_url(url):`
			`url_lower = url.lower()`
			`return any(pattern.lower() in url_lower for pattern in JUNK_URL_PATTERNS)`

			`def pass3_purge_junk(qdrant, collection, dry_run):`
			`log.info("=== PASS 3: Purging junk URLs ===")`
			`stats = defaultdict(int)`

			`# Scroll all web-source points and find junk`
			`offset = None`
			`junk_point_ids = []`
			`junk_doc_hashes = set()`

			`while True:`
			`results, offset = qdrant.scroll(`
			`collection_name=collection,`
			`scroll_filter=Filter(`
			`must=[FieldCondition(key="source_type", match=MatchAny(any=["web"]))]`
			`),`
			`limit=500,`
			`with_payload=True,`
			`with_vectors=False,`
			`offset=offset,`
			`)`
			`for r in results:`
			`filename = r.payload.get("filename", "")`
			`doc_hash = r.payload.get("doc_hash", "")`
			`if is_junk_url(filename):`
			`junk_point_ids.append(r.id)`
			`if doc_hash:`
			`junk_doc_hashes.add(doc_hash)`
			`if offset is None:`
			`break`

			`log.info(f"Found {len(junk_point_ids)} junk vectors across {len(junk_doc_hashes)} documents")`

			`if not dry_run and junk_point_ids:`
			`# Delete in batches`
			`batch_size = 500`
			`for i in range(0, len(junk_point_ids), batch_size):`
			`batch = junk_point_ids[i:i + batch_size]`
			`qdrant.delete(collection_name=collection, points_selector=batch)`
			`log.info(f"Deleted {len(junk_point_ids)} junk vectors from Qdrant")`

			`# Mark junk docs as skipped in SQLite`
			`conn = sqlite3.connect(str(DB_PATH))`
			`for doc_hash in junk_doc_hashes:`
			`conn.execute(`
			`"UPDATE documents SET status = 'skipped', error_message = 'junk content purged' WHERE hash = ?",`
			`(doc_hash,)`
			`)`
			`conn.commit()`
			`conn.close()`
			`log.info(f"Marked {len(junk_doc_hashes)} documents as skipped in DB")`

			`stats["junk_vectors"] = len(junk_point_ids)`
			`stats["junk_docs"] = len(junk_doc_hashes)`
			`log.info(f"Pass 3 complete: {stats['junk_vectors']} vectors, {stats['junk_docs']} docs purged")`
			`return stats`


			`def main():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("--dry-run", action="store_true")`
			`parser.add_argument("--skip-pass", type=int, action="append", default=[])`
			`args = parser.parse_args()`

			`config = get_config()`
			`keys = load_gemini_keys()`
			`rotator = KeyRotator(keys)`

			`qdrant = QdrantClient(`
			`host=config['vector_db']['host'],`
			`port=config['vector_db']['port'],`
			`timeout=60`
			`)`
			`collection = config['vector_db']['collection']`

			`log.info(f"Starting cleanup \| dry_run={args.dry_run} \| skipping passes: {args.skip_pass}")`

			`if 1 not in args.skip_pass:`
			`pass1_remap_outliers(qdrant, collection, args.dry_run)`

			`if 2 not in args.skip_pass:`
			`pass2_empty_domains(qdrant, collection, rotator, args.dry_run)`

			`if 3 not in args.skip_pass:`
			`pass3_purge_junk(qdrant, collection, args.dry_run)`

			`log.info("All passes complete.")`


			`if __name__ == "__main__":`
			`main()`