Initial commit: RECON codebase baseline

Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-20 06:34:40 +02:00 · 2026-04-14 14:57:23 +00:00 · 2026-04-14 14:57:23 +00:00 · 563c16bb71
commit 563c16bb71
59 changed files with 18327 additions and 0 deletions
--- a/scripts/cleanup_outliers.py
+++ b/scripts/cleanup_outliers.py
@@ -0,0 +1,449 @@
+#!/usr/bin/env python3
+"""
+cleanup_outliers.py — Three-pass cleanup of RECON concept data.
+
+Pass 1: Remap ~160 non-canonical domain strings in concept JSONs + Qdrant payloads
+Pass 2: Re-enrich 434 concepts with empty domain arrays via Gemini
+Pass 3: Purge junk/noise URLs from Qdrant + SQLite DB
+
+Usage:
+  python3 /opt/recon/scripts/cleanup_outliers.py [--dry-run] [--skip-pass N]
+"""
+
+import json
+import time
+import random
+import logging
+import argparse
+import threading
+import sqlite3
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from collections import defaultdict
+
+import google.generativeai as genai
+from qdrant_client import QdrantClient
+from qdrant_client.models import FieldCondition, MatchAny, Filter
+
+import sys, os
+sys.path.insert(0, '/opt/recon')
+from lib.utils import get_config, setup_logging
+
+# Log every message both to a file under the RECON install and to the console.
+# NOTE(review): logging.FileHandler opens LOG_FILE as soon as it is constructed, so
+# this module presumably relies on /opt/recon/logs already existing — confirm.
+LOG_FILE = Path("/opt/recon/logs/cleanup_outliers.log")
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
+)
+log = logging.getLogger("cleanup_outliers")
+
+# CONCEPTS_DIR is searched for window_*.json concept files (recursively in pass 1,
+# per doc_hash sub-directory in pass 2); DB_PATH is the SQLite DB updated in pass 3.
+CONCEPTS_DIR = Path("/opt/recon/data/concepts")
+DB_PATH = Path("/opt/recon/data/recon.db")
+
+# The only domain strings this script accepts. Anything else is either remapped via
+# OUTLIER_MAP or dropped, and Gemini answers are filtered against this set.
+# Keep in sync with the "VALID DOMAINS" list embedded in CLASSIFY_PROMPT.
+CANONICAL_DOMAINS = {
+    "Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
+    "Foundational Skills", "Communications", "Medical", "Food Systems",
+    "Navigation", "Logistics", "Power Systems", "Leadership",
+    "Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
+}
+
+# Non-canonical domain string → canonical replacement.
+# Keys are matched exactly (case- and whitespace-sensitive); they are also the
+# values used for the Qdrant MatchAny filter in pass 1.
+OUTLIER_MAP = {
+    "Zoology":                  "Sustainment Systems",
+    "Botany":                   "Sustainment Systems",
+    "Nature Lore":              "Sustainment Systems",
+    "Ecology":                  "Sustainment Systems",
+    "Navigational Astronomy":   "Navigation",
+    "Troubleshooting":          "Foundational Skills",
+    "Chemistry":                "Foundational Skills",
+    "Metallurgy":               "Foundational Skills",
+    "Weird Science":            "Foundational Skills",
+    "Philosophy of physics":    "Foundational Skills",
+    "Physics":                  "Foundational Skills",
+    "Cell biology":             "Foundational Skills",
+    "Economics":                "Leadership",
+    "Business":                 "Leadership",
+    "Safety":                   "Security",
+    "Law Enforcement":          "Security",
+    "Security & Intelligence":  "Security",
+    "Fire Weather":             "Scenario Playbooks",
+    "Legal":                    "Leadership",
+    # Labels with no real counterpart: folded into Foundational Skills, which
+    # is also the fallback used elsewhere in this script.
+    "Site News":                "Foundational Skills",
+    "Paleogeography":           "Foundational Skills",
+    "Chemical Manipulation":    "Foundational Skills",
+}
+
+# Junk URL patterns — pages with no knowledge value.
+# Each entry is used as a case-insensitive *substring* test by is_junk_url(), so a
+# pattern without a trailing slash also matches longer paths that start with it
+# (e.g. ".../contact" matches ".../contact-us").
+JUNK_URL_PATTERNS = [
+    # rocketstoves.com nav/template garbage
+    "rocketstoves.com/favicon",
+    "rocketstoves.com/cropped-favicon",
+    "rocketstoves.com/layouts/",
+    "rocketstoves.com/sample",
+    "rocketstoves.com/templates/",
+    "rocketstoves.com/hello-world",
+    "rocketstoves.com/blog-forthcoming",
+    "rocketstoves.com/contact",
+    "rocketstoves.com/acknowledgements",
+    "rocketstoves.com/ja3",
+    "rocketstoves.com/juxtapositions",
+    "rocketstoves.com/no-name-soi",
+    "rocketstoves.com/big4",
+    "rocketstoves.com/roof",
+    "rocketstoves.com/rmh_dloadcover",
+    "rocketstoves.com/pedcover",
+    "rocketstoves.com/laundry-to-landscape",
+    "rocketstoves.com/barreloven",
+    # NRCS calendar/event noise
+    "nrcs.usda.gov/events/",
+    "nrcs.usda.gov/state-offices/massachusetts",
+    "nrcs.usda.gov/state-offices/nebraska",
+    "nrcs.usda.gov/state-offices/oklahoma",
+    "nrcs.usda.gov/state-offices/utah",
+    "nrcs.usda.gov/conservation-basics/natural-resource-concerns/soil/western-call-for-abstracts",
+    # deeranddeerhunting trophy hunt videos (no knowledge value)
+    "deeranddeerhunting.com/trophy-whitetails-exclusive-videos/",
+    # eattheweeds non-content pages
+    "eattheweeds.com/media-interviews-with-green-deane",
+    "eattheweeds.com/motorcycles-and-mushrooms",
+    "eattheweeds.com/sunny-savage",
+    # foragersharvest nav pages
+    "foragersharvest.com/contact",
+    "foragersharvest.com/podcasts",
+    # motherearthnews classifieds/nav
+    "motherearthnews.com/classifieds/",
+    "motherearthnews.com/biographies/",
+]
+
+# Prompt template for classify_concept(). It is filled with str.format(), so the
+# {title}/{subdomain}/{content} placeholders are substituted and the doubled braces
+# around the JSON example render as literal braces. The domain list below must
+# match CANONICAL_DOMAINS, because answers outside that set are discarded.
+CLASSIFY_PROMPT = """\
+Classify this knowledge concept into one or more domains.
+
+VALID DOMAINS (use ONLY these exact strings):
+  Defense & Tactics, Sustainment Systems, Off-Grid Systems, Foundational Skills,
+  Communications, Medical, Food Systems, Navigation, Logistics, Power Systems,
+  Leadership, Scenario Playbooks, Water Systems, Security, Community Coordination
+
+Concept title: {title}
+Concept tags: {subdomain}
+Concept preview: {content}
+
+Return ONLY valid JSON, no markdown:
+{{"domain": ["Domain Name"]}}
+
+Rules:
+- Never return empty domain list
+- Medical content, herbs, first aid, veterinary → Medical
+- Food growing, foraging, hunting, livestock → Sustainment Systems
+- Food preservation, canning, storage → Food Systems
+- Solar, wind, batteries, generators → Power Systems
+- Water sourcing, filtration, sanitation → Water Systems
+"""
+
+def load_gemini_keys(env_path=Path("/opt/recon/.env")):
+    """Read Gemini API keys from a dotenv-style file.
+
+    Every line whose name starts with ``GEMINI_KEY_`` contributes one key, in
+    file order. Surrounding whitespace and one pair of matching quotes around
+    the value are removed. Lines without ``=`` or with an empty value are
+    ignored instead of raising or producing an unusable empty key.
+
+    Args:
+        env_path: Location of the env file; defaults to the RECON install.
+
+    Returns:
+        list[str]: The keys found (possibly empty).
+
+    Raises:
+        OSError: If the file cannot be read.
+    """
+    keys = []
+    for raw_line in Path(env_path).read_text().splitlines():
+        line = raw_line.strip()
+        if not line.startswith("GEMINI_KEY_"):
+            continue
+        # partition() never raises, unlike split("=", 1)[1] on a line without "=".
+        _name, sep, value = line.partition("=")
+        value = value.strip()
+        # GEMINI_KEY_1="abc" is common in .env files; the quotes are not part of the key.
+        if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
+            value = value[1:-1].strip()
+        if sep and value:
+            keys.append(value)
+    return keys
+
+class KeyRotator:
+    """Round-robin dispenser for API keys, guarded by a lock."""
+
+    def __init__(self, keys):
+        # Snapshot the keys so later mutation of the caller's list cannot
+        # shift the rotation; this also accepts any iterable.
+        self.keys = list(keys)
+        self._i = 0
+        self._lock = threading.Lock()
+
+    def next(self):
+        """Return the next key, wrapping around after the last one.
+
+        Raises:
+            ValueError: If the rotator was built without any keys (previously
+                this surfaced as a bare ZeroDivisionError from the modulo).
+        """
+        with self._lock:
+            if not self.keys:
+                raise ValueError("KeyRotator has no API keys to hand out")
+            key = self.keys[self._i % len(self.keys)]
+            self._i += 1
+            return key
+
+def classify_concept(title, subdomains, content, key):
+    """Ask Gemini to assign canonical domains to one concept.
+
+    Args:
+        title: Concept title; "(untitled)" is sent when falsy.
+        subdomains: Tag list (at most the first 10 are sent); a bare string is
+            treated as a single tag.
+        content: Concept text; only the first 300 characters are sent.
+        key: Gemini API key to configure for this call.
+
+    Returns:
+        list[str]: A non-empty list of CANONICAL_DOMAINS members. Falls back to
+        ``["Foundational Skills"]`` when no valid answer is obtained within
+        four attempts or a non-retryable error occurs.
+    """
+    # A bare string would otherwise be sliced and joined character by character.
+    if isinstance(subdomains, str):
+        subdomains = [subdomains]
+    prompt = CLASSIFY_PROMPT.format(
+        title=title or "(untitled)",
+        subdomain=", ".join(str(s) for s in subdomains[:10]) if subdomains else "(none)",
+        content=str(content)[:300] if content else "(none)",
+    )
+    genai.configure(api_key=key)
+    model = genai.GenerativeModel(
+        "gemini-2.0-flash",
+        generation_config={"response_mime_type": "application/json"}
+    )
+    max_attempts = 4
+    for attempt in range(max_attempts):
+        try:
+            resp = model.generate_content(prompt)
+            data = json.loads(resp.text)
+            raw = data.get("domain", []) if isinstance(data, dict) else []
+            # The model sometimes answers {"domain": "Medical"}; accept that shape too.
+            if isinstance(raw, str):
+                raw = [raw]
+            elif not isinstance(raw, list):
+                raw = []
+            domains = [d for d in raw if isinstance(d, str) and d in CANONICAL_DOMAINS]
+            if domains:
+                return domains
+            # Parsed fine but nothing usable: fall through and ask again.
+        except Exception as e:
+            err = str(e).lower()
+            if not any(s in err for s in ["429", "quota", "rate", "503"]):
+                # Non-retryable: record why this concept ends up with the fallback.
+                log.warning(f"Gemini classification failed for {title!r}: {e}")
+                break
+            # Exponential backoff with jitter; pointless after the final attempt.
+            if attempt < max_attempts - 1:
+                time.sleep(min(5 * (2 ** attempt) + random.uniform(0, 3), 60))
+    log.warning(f"No valid domain returned for {title!r}; using fallback")
+    return ["Foundational Skills"]
+
+# ── PASS 1: Remap outlier domains ────────────────────────────────────────────
+
+def remap_concept_domains(domains):
+    """Normalize a domain list to canonical names.
+
+    Canonical entries are kept, OUTLIER_MAP keys are replaced by their mapped
+    value, and everything else (unknown strings, non-string entries) is
+    dropped. Duplicates are collapsed and first-seen order is preserved, so
+    rewritten files are stable from run to run.
+
+    Args:
+        domains: Iterable of domain strings; ``None`` is treated as empty and
+            a bare string as a one-element list.
+
+    Returns:
+        tuple[list[str], bool]: The normalized list, and whether any entry was
+        remapped or dropped. The list may be empty; callers supply a fallback.
+    """
+    if domains is None:
+        domains = []
+    elif isinstance(domains, str):
+        domains = [domains]
+
+    result = []
+    changed = False
+    for d in domains:
+        if isinstance(d, str) and d in CANONICAL_DOMAINS:
+            target = d
+        elif isinstance(d, str) and d in OUTLIER_MAP:
+            target = OUTLIER_MAP[d]
+            changed = True
+        else:
+            changed = True  # drop unknown
+            continue
+        if target not in result:
+            result.append(target)
+    return result, changed
+
+def _remap_qdrant_outliers(qdrant, collection, dry_run):
+    """Rewrite the domain payload of points tagged with an OUTLIER_MAP key; return the count."""
+    outlier_names = list(OUTLIER_MAP.keys())
+
+    # Collect every affected point first, then update.
+    offset = None
+    affected_points = []
+    while True:
+        results, offset = qdrant.scroll(
+            collection_name=collection,
+            scroll_filter=Filter(
+                must=[FieldCondition(
+                    key="domain",
+                    match=MatchAny(any=outlier_names)
+                )]
+            ),
+            limit=500,
+            with_payload=True,
+            with_vectors=False,
+            offset=offset,
+        )
+        affected_points.extend(results)
+        if offset is None:
+            break
+
+    log.info(f"Found {len(affected_points)} Qdrant points with outlier domains")
+
+    updated = 0
+    for point in affected_points:
+        old_domains = (point.payload or {}).get("domain") or []
+        if isinstance(old_domains, str):
+            old_domains = [old_domains]
+
+        new_domains, changed = remap_concept_domains(old_domains)
+        if not changed:
+            continue
+        updated += 1
+        if not dry_run:
+            qdrant.set_payload(
+                collection_name=collection,
+                # Never leave a point without a domain.
+                payload={"domain": new_domains or ["Foundational Skills"]},
+                points=[point.id],
+            )
+    return updated
+
+
+def _remap_concept_files(dry_run):
+    """Apply the same remap to every window_*.json under CONCEPTS_DIR; return files changed."""
+    json_fixed = 0
+    for window_file in CONCEPTS_DIR.rglob("window_*.json"):
+        try:
+            with open(window_file, "r", encoding="utf-8") as f:
+                concepts = json.load(f)
+        except (OSError, ValueError) as e:
+            # Still best-effort, but leave a trace so bad files can be found.
+            log.warning(f"Skipping unreadable concept file {window_file}: {e}")
+            continue
+
+        if not isinstance(concepts, list):
+            continue
+
+        file_changed = False
+        for concept in concepts:
+            if not isinstance(concept, dict):
+                continue
+            # `or []` also covers an explicit null, which would otherwise
+            # raise TypeError when iterated.
+            raw = concept.get("domain") or []
+            if isinstance(raw, str):
+                raw = [raw]
+            new, changed = remap_concept_domains(raw)
+            if changed:
+                concept["domain"] = new if new else ["Foundational Skills"]
+                file_changed = True
+
+        if file_changed:
+            json_fixed += 1
+            if not dry_run:
+                with open(window_file, "w", encoding="utf-8") as f:
+                    json.dump(concepts, f, indent=2, ensure_ascii=False)
+    return json_fixed
+
+
+def pass1_remap_outliers(qdrant, collection, dry_run):
+    """Pass 1: replace non-canonical domain names in Qdrant payloads and concept JSONs.
+
+    Args:
+        qdrant: Qdrant client used for scroll/set_payload.
+        collection: Name of the Qdrant collection to clean.
+        dry_run: When true, count what would change but write nothing.
+
+    Returns:
+        defaultdict[str, int]: ``qdrant_updated`` (points changed) and
+        ``json_updated`` (files changed).
+    """
+    log.info("=== PASS 1: Remapping non-canonical outlier domains ===")
+    stats = defaultdict(int)
+
+    stats["qdrant_updated"] = _remap_qdrant_outliers(qdrant, collection, dry_run)
+    json_fixed = _remap_concept_files(dry_run)
+    stats["json_updated"] = json_fixed
+
+    log.info(f"Pass 1 complete: {stats['qdrant_updated']} Qdrant points updated, {json_fixed} JSON files updated")
+    return stats
+
+# ── PASS 2: Re-enrich empty domain concepts ──────────────────────────────────
+
+def pass2_empty_domains(qdrant, collection, key_rotator, dry_run):
+    """Pass 2: classify every point whose domain is empty and store the result.
+
+    The whole collection is scrolled (no server-side filter) and points with a
+    missing/empty domain are classified through classify_concept(). The new
+    domain is written to the Qdrant payload and to matching concepts (same
+    title, still empty domain) in the document's window_*.json files.
+
+    Args:
+        qdrant: Qdrant client used for scroll/set_payload.
+        collection: Name of the Qdrant collection to clean.
+        key_rotator: Object whose ``next()`` yields the API key for each call.
+        dry_run: When true, skip all writes. Classification requests are still
+            sent, so a dry run consumes API quota.
+
+    Returns:
+        defaultdict[str, int]: ``classified`` (points processed) and
+        ``json_updated`` (files that needed a change).
+    """
+    log.info("=== PASS 2: Re-enriching empty domain concepts ===")
+    stats = defaultdict(int)
+
+    def is_empty(value):
+        # Missing, null, "", [] and [""] all mean "no domain assigned". The same
+        # test is used for Qdrant and for the JSON files so both stay in step.
+        return not value or value == [""]
+
+    # Find empty domain points in Qdrant
+    offset = None
+    empty_points = []
+    while True:
+        results, offset = qdrant.scroll(
+            collection_name=collection,
+            limit=500,
+            with_payload=True,
+            with_vectors=False,
+            offset=offset,
+        )
+        empty_points.extend(
+            r for r in results if is_empty((r.payload or {}).get("domain"))
+        )
+        if offset is None:
+            break
+
+    log.info(f"Found {len(empty_points)} points with empty domains")
+
+    for point in empty_points:
+        payload = point.payload or {}
+        title = payload.get("title", "")
+        subdomains = payload.get("subdomain", [])
+        content = payload.get("content", payload.get("summary", ""))
+
+        key = key_rotator.next()
+        new_domains = classify_concept(title, subdomains, content, key)
+        stats["classified"] += 1
+
+        if not dry_run:
+            qdrant.set_payload(
+                collection_name=collection,
+                payload={"domain": new_domains},
+                points=[point.id],
+            )
+
+        # Also update the concept JSON on disk
+        doc_hash = payload.get("doc_hash", "")
+        doc_concepts_dir = CONCEPTS_DIR / doc_hash if doc_hash else None
+        if doc_concepts_dir is not None and doc_concepts_dir.exists():
+            for wf in doc_concepts_dir.glob("window_*.json"):
+                try:
+                    with open(wf, "r", encoding="utf-8") as f:
+                        concepts = json.load(f)
+                    if not isinstance(concepts, list):
+                        continue
+                    changed = False
+                    for c in concepts:
+                        if (
+                            isinstance(c, dict)
+                            and c.get("title") == title
+                            and is_empty(c.get("domain"))
+                        ):
+                            c["domain"] = new_domains
+                            changed = True
+                    if changed:
+                        stats["json_updated"] += 1
+                        if not dry_run:
+                            with open(wf, "w", encoding="utf-8") as f:
+                                json.dump(concepts, f, indent=2, ensure_ascii=False)
+                except (OSError, ValueError) as e:
+                    # Best-effort: Qdrant is already updated, so just record the miss.
+                    log.warning(f"Could not update concept file {wf}: {e}")
+
+        # Small pause between classification requests.
+        time.sleep(0.05)
+
+    log.info(f"Pass 2 complete: {stats['classified']} concepts re-classified")
+    return stats
+
+# ── PASS 3: Purge junk URLs ──────────────────────────────────────────────────
+
+def is_junk_url(url):
+    """Return True when *url* contains any JUNK_URL_PATTERNS entry.
+
+    The test is a case-insensitive substring match. Non-string input (for
+    example a payload field stored as null) is never junk, rather than
+    raising AttributeError.
+    """
+    if not isinstance(url, str):
+        return False
+    url_lower = url.lower()
+    return any(pattern.lower() in url_lower for pattern in JUNK_URL_PATTERNS)
+
+def pass3_purge_junk(qdrant, collection, dry_run):
+    """Pass 3: delete vectors of junk web pages and mark their documents skipped.
+
+    Points with ``source_type == "web"`` whose ``filename`` payload matches
+    is_junk_url() are deleted from Qdrant in batches of 500. Every document
+    hash seen on such a point is set to status ``skipped`` in the SQLite
+    ``documents`` table.
+
+    Args:
+        qdrant: Qdrant client used for scroll/delete.
+        collection: Name of the Qdrant collection to clean.
+        dry_run: When true, only report what would be removed.
+
+    Returns:
+        defaultdict[str, int]: ``junk_vectors`` and ``junk_docs`` counts.
+    """
+    log.info("=== PASS 3: Purging junk URLs ===")
+    stats = defaultdict(int)
+
+    # Scroll all web-source points and find junk
+    offset = None
+    junk_point_ids = []
+    junk_doc_hashes = set()
+
+    while True:
+        results, offset = qdrant.scroll(
+            collection_name=collection,
+            scroll_filter=Filter(
+                must=[FieldCondition(key="source_type", match=MatchAny(any=["web"]))]
+            ),
+            limit=500,
+            with_payload=True,
+            with_vectors=False,
+            offset=offset,
+        )
+        for r in results:
+            payload = r.payload or {}
+            # `or ""` covers a filename stored as null, which .lower() would choke on.
+            filename = payload.get("filename") or ""
+            doc_hash = payload.get("doc_hash", "")
+            if is_junk_url(filename):
+                junk_point_ids.append(r.id)
+                if doc_hash:
+                    junk_doc_hashes.add(doc_hash)
+        if offset is None:
+            break
+
+    log.info(f"Found {len(junk_point_ids)} junk vectors across {len(junk_doc_hashes)} documents")
+
+    if not dry_run and junk_point_ids:
+        # Delete in batches
+        batch_size = 500
+        for i in range(0, len(junk_point_ids), batch_size):
+            batch = junk_point_ids[i:i + batch_size]
+            qdrant.delete(collection_name=collection, points_selector=batch)
+        log.info(f"Deleted {len(junk_point_ids)} junk vectors from Qdrant")
+
+        # Mark junk docs as skipped in SQLite
+        conn = sqlite3.connect(str(DB_PATH))
+        try:
+            # `with conn` commits on success and rolls back if an UPDATE fails;
+            # the finally clause makes sure the handle is released either way.
+            with conn:
+                conn.executemany(
+                    "UPDATE documents SET status = 'skipped', error_message = 'junk content purged' WHERE hash = ?",
+                    [(doc_hash,) for doc_hash in junk_doc_hashes],
+                )
+        finally:
+            conn.close()
+        log.info(f"Marked {len(junk_doc_hashes)} documents as skipped in DB")
+
+    stats["junk_vectors"] = len(junk_point_ids)
+    stats["junk_docs"] = len(junk_doc_hashes)
+    # Do not claim a purge happened when nothing was written.
+    outcome = "would be purged (dry run)" if dry_run else "purged"
+    log.info(f"Pass 3 complete: {stats['junk_vectors']} vectors, {stats['junk_docs']} docs {outcome}")
+    return stats
+
+
+def main():
+    """Parse CLI options and run the enabled cleanup passes in order (1, 2, 3).
+
+    ``--dry-run`` suppresses writes; ``--skip-pass N`` may be repeated to
+    leave passes out. Gemini keys are only needed, and only loaded, when
+    pass 2 runs. A missing key set aborts before any pass has modified data.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--skip-pass", type=int, action="append", default=[])
+    args = parser.parse_args()
+
+    config = get_config()
+
+    run_pass2 = 2 not in args.skip_pass
+    rotator = None
+    if run_pass2:
+        keys = load_gemini_keys()
+        if not keys:
+            # Fail before pass 1 writes anything, not halfway through the run.
+            parser.error("no GEMINI_KEY_* entries found in /opt/recon/.env; add keys or use --skip-pass 2")
+        rotator = KeyRotator(keys)
+
+    qdrant = QdrantClient(
+        host=config['vector_db']['host'],
+        port=config['vector_db']['port'],
+        timeout=60
+    )
+    collection = config['vector_db']['collection']
+
+    log.info(f"Starting cleanup | dry_run={args.dry_run} | skipping passes: {args.skip_pass}")
+
+    if 1 not in args.skip_pass:
+        pass1_remap_outliers(qdrant, collection, args.dry_run)
+
+    if run_pass2:
+        pass2_empty_domains(qdrant, collection, rotator, args.dry_run)
+
+    if 3 not in args.skip_pass:
+        pass3_purge_junk(qdrant, collection, args.dry_run)
+
+    log.info("All passes complete.")
+
+
+# Run the cleanup only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()