#!/usr/bin/env python3
# Mirror of https://github.com/zvx-echo6/recon.git (synced 2026-05-20 14:44:54 +02:00).
# Upstream file at sync time: 449 lines, 16 KiB, Python.
"""
cleanup_outliers.py — Three-pass cleanup of RECON concept data.

Pass 1: Remap ~160 non-canonical domain strings in concept JSONs + Qdrant payloads
Pass 2: Re-enrich 434 concepts with empty domain arrays via Gemini
Pass 3: Purge junk/noise URLs from Qdrant + SQLite DB

Usage:
    python3 /opt/recon/scripts/cleanup_outliers.py [--dry-run] [--skip-pass N]
"""

import argparse
import json
import logging
import os
import random
import re
import sqlite3
import sys
import threading
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import closing
from pathlib import Path

import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import FieldCondition, MatchAny, Filter

# The project root must be on sys.path before the project-local import below.
sys.path.insert(0, '/opt/recon')
from lib.utils import get_config, setup_logging

# Log destination; messages are also echoed to the console via StreamHandler.
LOG_FILE = Path("/opt/recon/logs/cleanup_outliers.log")
# logging.FileHandler opens the file immediately and fails if the parent
# directory does not exist, so make sure it is there first.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
)
log = logging.getLogger("cleanup_outliers")

# Root directory searched for per-document ``window_*.json`` concept files.
CONCEPTS_DIR = Path("/opt/recon/data/concepts")
# SQLite database holding the ``documents`` table updated in pass 3.
DB_PATH = Path("/opt/recon/data/recon.db")

# The only domain names treated as valid; these same strings are listed in
# CLASSIFY_PROMPT, so keep the two in sync when editing either.
CANONICAL_DOMAINS = {
    "Defense & Tactics", "Sustainment Systems", "Off-Grid Systems",
    "Foundational Skills", "Communications", "Medical", "Food Systems",
    "Navigation", "Logistics", "Power Systems", "Leadership",
    "Scenario Playbooks", "Water Systems", "Security", "Community Coordination"
}

# Non-canonical → canonical remap
OUTLIER_MAP = {
    "Zoology": "Sustainment Systems",
    "Botany": "Sustainment Systems",
    "Nature Lore": "Sustainment Systems",
    "Ecology": "Sustainment Systems",
    "Navigational Astronomy": "Navigation",
    "Troubleshooting": "Foundational Skills",
    "Chemistry": "Foundational Skills",
    "Metallurgy": "Foundational Skills",
    "Weird Science": "Foundational Skills",
    "Philosophy of physics": "Foundational Skills",
    "Physics": "Foundational Skills",
    "Cell biology": "Foundational Skills",
    "Economics": "Leadership",
    "Business": "Leadership",
    "Safety": "Security",
    "Law Enforcement": "Security",
    "Security & Intelligence": "Security",
    "Fire Weather": "Scenario Playbooks",
    "Legal": "Leadership",
    # Discard — replace with closest real domain
    "Site News": "Foundational Skills",
    "Paleogeography": "Foundational Skills",
    "Chemical Manipulation": "Foundational Skills",
}

# Junk URL patterns — pages with no knowledge value.
# Each entry is matched as a case-insensitive substring (see is_junk_url).
JUNK_URL_PATTERNS = [
    # rocketstoves.com nav/template garbage
    "rocketstoves.com/favicon",
    "rocketstoves.com/cropped-favicon",
    "rocketstoves.com/layouts/",
    "rocketstoves.com/sample",
    "rocketstoves.com/templates/",
    "rocketstoves.com/hello-world",
    "rocketstoves.com/blog-forthcoming",
    "rocketstoves.com/contact",
    "rocketstoves.com/acknowledgements",
    "rocketstoves.com/ja3",
    "rocketstoves.com/juxtapositions",
    "rocketstoves.com/no-name-soi",
    "rocketstoves.com/big4",
    "rocketstoves.com/roof",
    "rocketstoves.com/rmh_dloadcover",
    "rocketstoves.com/pedcover",
    "rocketstoves.com/laundry-to-landscape",
    "rocketstoves.com/barreloven",
    # NRCS calendar/event noise
    "nrcs.usda.gov/events/",
    "nrcs.usda.gov/state-offices/massachusetts",
    "nrcs.usda.gov/state-offices/nebraska",
    "nrcs.usda.gov/state-offices/oklahoma",
    "nrcs.usda.gov/state-offices/utah",
    "nrcs.usda.gov/conservation-basics/natural-resource-concerns/soil/western-call-for-abstracts",
    # deeranddeerhunting trophy hunt videos (no knowledge value)
    "deeranddeerhunting.com/trophy-whitetails-exclusive-videos/",
    # eattheweeds non-content pages
    "eattheweeds.com/media-interviews-with-green-deane",
    "eattheweeds.com/motorcycles-and-mushrooms",
    "eattheweeds.com/sunny-savage",
    # foragersharvest nav pages
    "foragersharvest.com/contact",
    "foragersharvest.com/podcasts",
    # motherearthnews classifieds/nav
    "motherearthnews.com/classifieds/",
    "motherearthnews.com/biographies/",
]

# Prompt template for classify_concept. It is filled with str.format using the
# fields {title}, {subdomain} and {content}; the doubled braces around the JSON
# example are format escapes and render as single braces.
CLASSIFY_PROMPT = """\
Classify this knowledge concept into one or more domains.

VALID DOMAINS (use ONLY these exact strings):
Defense & Tactics, Sustainment Systems, Off-Grid Systems, Foundational Skills,
Communications, Medical, Food Systems, Navigation, Logistics, Power Systems,
Leadership, Scenario Playbooks, Water Systems, Security, Community Coordination

Concept title: {title}
Concept tags: {subdomain}
Concept preview: {content}

Return ONLY valid JSON, no markdown:
{{"domain": ["Domain Name"]}}

Rules:
- Never return empty domain list
- Medical content, herbs, first aid, veterinary → Medical
- Food growing, foraging, hunting, livestock → Sustainment Systems
- Food preservation, canning, storage → Food Systems
- Solar, wind, batteries, generators → Power Systems
- Water sourcing, filtration, sanitation → Water Systems
"""

def load_gemini_keys(env_path="/opt/recon/.env"):
    """Read Gemini API keys from a dotenv-style file.

    Every line whose variable name starts with ``GEMINI_KEY_`` contributes one
    key, in file order.

    Args:
        env_path: Path of the env file to read. Defaults to the location the
            script has always used.

    Returns:
        list[str]: The non-empty key values found (possibly an empty list).

    Raises:
        OSError: If the file cannot be read (e.g. it does not exist).
    """
    keys = []
    for raw_line in Path(env_path).read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        # Tolerate the common "export NAME=value" spelling.
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        if not line.startswith("GEMINI_KEY_"):
            continue
        _name, sep, value = line.partition("=")
        if not sep:
            # A bare "GEMINI_KEY_X" with no "=" carries no value; indexing the
            # split result here used to raise IndexError.
            continue
        value = value.strip()
        # Drop one pair of matching surrounding quotes so a quoted value is
        # not handed to the API with the quote characters attached.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        if value:
            keys.append(value)
    return keys

class KeyRotator:
    """Hand out API keys in round-robin order.

    ``next`` is guarded by a lock, so concurrent callers each receive the next
    key in sequence.
    """

    def __init__(self, keys):
        """Store *keys* (a sequence of key strings) and start at the first one."""
        self.keys = keys
        self._i = 0
        self._lock = threading.Lock()

    def next(self):
        """Return the next key, wrapping around at the end of the list.

        Raises:
            RuntimeError: If the rotator holds no keys. Previously this
                surfaced as a ZeroDivisionError from the modulo below.
        """
        with self._lock:
            if not self.keys:
                raise RuntimeError("KeyRotator has no API keys to hand out")
            key = self.keys[self._i % len(self.keys)]
            self._i += 1
            return key

# Error text that signals a transient or rate-limit failure worth retrying.
# "rate" must not be flanked by letters, so it no longer matches inside words
# such as "generate" or "generation".
_RETRYABLE_ERROR_RE = re.compile(
    r"429|503|quota|(?<![a-z])rate(?![a-z])|rate[ _-]?limit"
)


def classify_concept(title, subdomains, content, key):
    """Ask Gemini to assign canonical domains to one concept.

    Args:
        title: Concept title; "(untitled)" is substituted when empty.
        subdomains: Concept tags. A list is expected; a single string is
            treated as one tag. Only the first 10 are sent.
        content: Concept text; only the first 300 characters are sent.
        key: Gemini API key used for this call.

    Returns:
        list[str]: Names from CANONICAL_DOMAINS returned by the model, in the
        model's order without duplicates. Falls back to
        ``["Foundational Skills"]`` when no valid domain is obtained.
    """
    # A bare string would otherwise be sliced and joined character by character.
    if isinstance(subdomains, str):
        subdomains = [subdomains]
    elif not isinstance(subdomains, (list, tuple)):
        subdomains = [subdomains] if subdomains else []
    tags = ", ".join(str(s) for s in subdomains[:10]) or "(none)"

    prompt = CLASSIFY_PROMPT.format(
        title=title or "(untitled)",
        subdomain=tags,
        content=str(content)[:300] if content else "(none)",
    )
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        "gemini-2.0-flash",
        generation_config={"response_mime_type": "application/json"}
    )

    max_attempts = 4
    for attempt in range(max_attempts):
        try:
            resp = model.generate_content(prompt)
            data = json.loads(resp.text)
            raw = (data.get("domain") or []) if isinstance(data, dict) else []
            if isinstance(raw, str):
                # The model occasionally may answer with a bare string.
                raw = [raw]
            # dict.fromkeys de-duplicates while keeping the model's order.
            domains = list(dict.fromkeys(
                d for d in raw if isinstance(d, str) and d in CANONICAL_DOMAINS
            ))
            if domains:
                return domains
            # No valid domain in the answer: ask again immediately.
        except Exception as e:
            if not _RETRYABLE_ERROR_RE.search(str(e).lower()):
                log.warning("classify_concept: giving up on %r: %s", title, e)
                break
            if attempt == max_attempts - 1:
                # Nothing left to retry, so do not waste time sleeping.
                log.warning("classify_concept: retries exhausted for %r: %s", title, e)
                break
            # Exponential backoff with jitter, capped at 60 seconds.
            time.sleep(min(5 * (2 ** attempt) + random.uniform(0, 3), 60))
    return ["Foundational Skills"]

# ── PASS 1: Remap outlier domains ────────────────────────────────────────────


def remap_concept_domains(domains, canonical=None, outlier_map=None):
    """Remap any outlier domain names in a domain list.

    Canonical names are kept, names found in the outlier map are replaced by
    their canonical target, and everything else is dropped.

    Args:
        domains: Iterable of domain names. ``None`` is treated as empty and a
            single string as a one-element list.
        canonical: Set of valid names; defaults to CANONICAL_DOMAINS.
        outlier_map: Mapping of outlier name to canonical name; defaults to
            OUTLIER_MAP.

    Returns:
        tuple[list[str], bool]: The de-duplicated result in first-seen order,
        and whether any entry was remapped or dropped. Removing a duplicate
        alone does not set the flag.
    """
    if canonical is None:
        canonical = CANONICAL_DOMAINS
    if outlier_map is None:
        outlier_map = OUTLIER_MAP

    if domains is None:
        domains = []
    elif isinstance(domains, str):
        domains = [domains]

    # A dict is used as an ordered set so the output order is deterministic;
    # list(set(...)) varies between runs because of string hash randomization.
    kept = {}
    changed = False
    for d in domains:
        if not isinstance(d, str):
            # Non-string junk (None, nested lists, ...) cannot be a domain.
            changed = True
        elif d in canonical:
            kept[d] = None
        elif d in outlier_map:
            kept[outlier_map[d]] = None
            changed = True
        else:
            changed = True  # drop unknown
    return list(kept), changed

def pass1_remap_outliers(qdrant, collection, dry_run):
    """Pass 1: replace non-canonical domain names in Qdrant and on disk.

    Qdrant points whose ``domain`` payload contains any OUTLIER_MAP key are
    rewritten via ``set_payload``. Every ``window_*.json`` file under
    CONCEPTS_DIR is then scanned and rewritten when one of its concepts
    changes. A concept left with no domain gets ``["Foundational Skills"]``.

    Args:
        qdrant: Qdrant client (uses ``scroll`` and ``set_payload``).
        collection: Name of the Qdrant collection.
        dry_run: When true, count what would change but write nothing.

    Returns:
        defaultdict[str, int]: Counters ``qdrant_updated``, ``json_updated``
        and ``json_unreadable``.
    """
    log.info("=== PASS 1: Remapping non-canonical outlier domains ===")
    outlier_names = list(OUTLIER_MAP.keys())
    stats = defaultdict(int)

    # Scroll through Qdrant finding affected vectors
    outlier_filter = Filter(
        must=[FieldCondition(
            key="domain",
            match=MatchAny(any=outlier_names)
        )]
    )
    offset = None
    affected_points = []
    while True:
        results, offset = qdrant.scroll(
            collection_name=collection,
            scroll_filter=outlier_filter,
            limit=500,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        affected_points.extend(results)
        if offset is None:
            break

    log.info(f"Found {len(affected_points)} Qdrant points with outlier domains")

    for point in affected_points:
        payload = point.payload or {}
        old_domains = payload.get("domain") or []
        if isinstance(old_domains, str):
            old_domains = [old_domains]

        new_domains, changed = remap_concept_domains(old_domains)
        if not new_domains:
            new_domains = ["Foundational Skills"]

        if changed:
            stats["qdrant_updated"] += 1
            if not dry_run:
                qdrant.set_payload(
                    collection_name=collection,
                    payload={"domain": new_domains},
                    points=[point.id],
                )

    # Also fix concept JSON files on disk
    json_fixed = 0
    for window_file in CONCEPTS_DIR.rglob("window_*.json"):
        try:
            with open(window_file, "r", encoding="utf-8") as f:
                concepts = json.load(f)
        except (OSError, ValueError) as exc:
            # Still best-effort, but leave a trace instead of skipping silently.
            # ValueError covers both JSON and text-decoding errors.
            log.warning("Pass 1: skipping unreadable concept file %s: %s", window_file, exc)
            stats["json_unreadable"] += 1
            continue

        if not isinstance(concepts, list):
            continue

        file_changed = False
        for concept in concepts:
            if not isinstance(concept, dict):
                continue
            raw = concept.get("domain", [])
            if raw is None:
                # An explicit JSON null has nothing to remap and is not iterable.
                continue
            if isinstance(raw, str):
                raw = [raw]
            new, changed = remap_concept_domains(raw)
            if changed:
                concept["domain"] = new if new else ["Foundational Skills"]
                file_changed = True

        if file_changed:
            json_fixed += 1
            if not dry_run:
                with open(window_file, "w", encoding="utf-8") as f:
                    json.dump(concepts, f, indent=2, ensure_ascii=False)

    stats["json_updated"] = json_fixed
    log.info(f"Pass 1 complete: {stats['qdrant_updated']} Qdrant points updated, {json_fixed} JSON files updated")
    return stats

# ── PASS 2: Re-enrich empty domain concepts ──────────────────────────────────


def _domain_is_empty(value):
    """Return True when *value* holds no usable domain (missing, [], "" or [""])."""
    return not value or value == [""]


def _write_domains_to_concept_files(doc_hash, title, new_domains):
    """Store *new_domains* on matching concepts under ``CONCEPTS_DIR/doc_hash``.

    A concept matches when its ``title`` equals *title* and its current domain
    is empty. Unreadable or unwritable files are logged and skipped.

    Returns:
        int: Number of window files rewritten.
    """
    doc_concepts_dir = CONCEPTS_DIR / doc_hash
    if not doc_concepts_dir.exists():
        return 0

    rewritten = 0
    for wf in doc_concepts_dir.glob("window_*.json"):
        try:
            with open(wf, "r", encoding="utf-8") as f:
                concepts = json.load(f)
        except (OSError, ValueError) as exc:
            log.warning("Pass 2: skipping unreadable concept file %s: %s", wf, exc)
            continue
        if not isinstance(concepts, list):
            continue

        changed = False
        for c in concepts:
            if (
                isinstance(c, dict)
                and c.get("title") == title
                and _domain_is_empty(c.get("domain"))
            ):
                c["domain"] = new_domains
                changed = True
        if not changed:
            continue

        try:
            with open(wf, "w", encoding="utf-8") as f:
                json.dump(concepts, f, indent=2, ensure_ascii=False)
        except OSError as exc:
            log.warning("Pass 2: could not write concept file %s: %s", wf, exc)
            continue
        rewritten += 1
    return rewritten


def pass2_empty_domains(qdrant, collection, key_rotator, dry_run):
    """Pass 2: classify concepts whose domain is empty and store the result.

    Scans the whole collection, asks classify_concept for domains for each
    point with an empty ``domain`` payload, writes them back to Qdrant and to
    the matching concept JSON files on disk.

    Args:
        qdrant: Qdrant client (uses ``scroll`` and ``set_payload``).
        collection: Name of the Qdrant collection.
        key_rotator: Object whose ``next()`` returns the API key to use.
        dry_run: When true, only report how many concepts would be
            classified. The Gemini API is not called, because its answers
            would be discarded anyway.

    Returns:
        defaultdict[str, int]: Counters ``classified`` and ``json_updated``,
        or ``would_classify`` for a dry run.
    """
    log.info("=== PASS 2: Re-enriching empty domain concepts ===")
    stats = defaultdict(int)

    # Find empty domain points in Qdrant
    offset = None
    empty_points = []
    while True:
        results, offset = qdrant.scroll(
            collection_name=collection,
            limit=500,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        empty_points.extend(
            r for r in results
            if _domain_is_empty((r.payload or {}).get("domain", []))
        )
        if offset is None:
            break

    log.info(f"Found {len(empty_points)} points with empty domains")

    if dry_run:
        stats["would_classify"] = len(empty_points)
        log.info(
            f"Pass 2 dry run: {len(empty_points)} concepts would be re-classified "
            "(Gemini not called)"
        )
        return stats

    for point in empty_points:
        payload = point.payload or {}
        title = payload.get("title", "")
        subdomains = payload.get("subdomain", [])
        if isinstance(subdomains, str):
            subdomains = [subdomains]
        # Fall back to the summary when content is missing or empty.
        content = payload.get("content") or payload.get("summary") or ""

        key = key_rotator.next()
        new_domains = classify_concept(title, subdomains, content, key)
        stats["classified"] += 1

        qdrant.set_payload(
            collection_name=collection,
            payload={"domain": new_domains},
            points=[point.id],
        )

        # Also update the concept JSON on disk
        doc_hash = payload.get("doc_hash", "")
        if doc_hash:
            stats["json_updated"] += _write_domains_to_concept_files(
                doc_hash, title, new_domains
            )

        # Short pause between requests.
        time.sleep(0.05)

    log.info(f"Pass 2 complete: {stats['classified']} concepts re-classified")
    return stats

# ── PASS 3: Purge junk URLs ──────────────────────────────────────────────────


def is_junk_url(url, patterns=None):
    """Return True when *url* contains any junk pattern, ignoring case.

    Args:
        url: URL (or filename) to test. ``None``, empty and non-string values
            are never junk.
        patterns: Iterable of substrings to look for; defaults to
            JUNK_URL_PATTERNS.
    """
    if not url or not isinstance(url, str):
        return False
    if patterns is None:
        patterns = JUNK_URL_PATTERNS
    url_lower = url.lower()
    return any(pattern.lower() in url_lower for pattern in patterns)

def pass3_purge_junk(qdrant, collection, dry_run):
    """Pass 3: delete vectors of junk web pages and mark their documents skipped.

    Scans points whose ``source_type`` is ``"web"``, selects those whose
    ``filename`` payload matches is_junk_url, deletes them from Qdrant in
    batches of 500 and sets ``status = 'skipped'`` on the matching rows of the
    ``documents`` table in the SQLite database at DB_PATH.

    Args:
        qdrant: Qdrant client (uses ``scroll`` and ``delete``).
        collection: Name of the Qdrant collection.
        dry_run: When true, only count; nothing is deleted or updated.

    Returns:
        defaultdict[str, int]: Counters ``junk_vectors`` and ``junk_docs``.
    """
    log.info("=== PASS 3: Purging junk URLs ===")
    stats = defaultdict(int)

    # Scroll all web-source points and find junk
    web_filter = Filter(
        must=[FieldCondition(key="source_type", match=MatchAny(any=["web"]))]
    )
    offset = None
    junk_point_ids = []
    junk_doc_hashes = set()

    while True:
        results, offset = qdrant.scroll(
            collection_name=collection,
            scroll_filter=web_filter,
            limit=500,
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        for r in results:
            payload = r.payload or {}
            # A stored null would otherwise crash the .lower() in is_junk_url.
            filename = payload.get("filename") or ""
            doc_hash = payload.get("doc_hash") or ""
            if isinstance(filename, str) and is_junk_url(filename):
                junk_point_ids.append(r.id)
                if doc_hash:
                    junk_doc_hashes.add(doc_hash)
        if offset is None:
            break

    log.info(f"Found {len(junk_point_ids)} junk vectors across {len(junk_doc_hashes)} documents")

    if dry_run:
        log.info("Pass 3 dry run: nothing was deleted or updated")
    elif junk_point_ids:
        # Delete in batches
        batch_size = 500
        for i in range(0, len(junk_point_ids), batch_size):
            batch = junk_point_ids[i:i + batch_size]
            qdrant.delete(collection_name=collection, points_selector=batch)
        log.info(f"Deleted {len(junk_point_ids)} junk vectors from Qdrant")

        # Mark junk docs as skipped in SQLite.
        # closing() guarantees the connection is released even if the update
        # fails; the inner "with conn" commits on success and rolls back on error.
        with closing(sqlite3.connect(str(DB_PATH))) as conn:
            with conn:
                conn.executemany(
                    "UPDATE documents SET status = 'skipped', error_message = 'junk content purged' WHERE hash = ?",
                    [(doc_hash,) for doc_hash in junk_doc_hashes],
                )
        log.info(f"Marked {len(junk_doc_hashes)} documents as skipped in DB")

    stats["junk_vectors"] = len(junk_point_ids)
    stats["junk_docs"] = len(junk_doc_hashes)
    log.info(f"Pass 3 complete: {stats['junk_vectors']} vectors, {stats['junk_docs']} docs purged")
    return stats

def main(argv=None):
    """Parse the command line and run the selected cleanup passes in order.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: On invalid arguments, or when pass 2 is requested but no
            Gemini API key can be loaded.
    """
    parser = argparse.ArgumentParser(
        description="Three-pass cleanup of RECON concept data."
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="report what would change without writing anything",
    )
    parser.add_argument(
        "--skip-pass", type=int, action="append", default=[], choices=[1, 2, 3],
        help="pass number to skip; may be given more than once",
    )
    args = parser.parse_args(argv)

    config = get_config()

    # Keys are only needed by pass 2. Check them before any pass runs so a
    # missing key is reported up front rather than after pass 1 has already
    # modified data.
    rotator = None
    if 2 not in args.skip_pass:
        try:
            keys = load_gemini_keys()
        except OSError as exc:
            log.error(f"Cannot read Gemini API keys: {exc}")
            raise SystemExit(1) from exc
        if not keys:
            log.error("No GEMINI_KEY_* entries found; add keys or use --skip-pass 2")
            raise SystemExit(1)
        rotator = KeyRotator(keys)

    qdrant = QdrantClient(
        host=config['vector_db']['host'],
        port=config['vector_db']['port'],
        timeout=60
    )
    collection = config['vector_db']['collection']

    log.info(f"Starting cleanup | dry_run={args.dry_run} | skipping passes: {args.skip_pass}")

    if 1 not in args.skip_pass:
        pass1_remap_outliers(qdrant, collection, args.dry_run)

    if 2 not in args.skip_pass:
        pass2_empty_domains(qdrant, collection, rotator, args.dry_run)

    if 3 not in args.skip_pass:
        pass3_purge_junk(qdrant, collection, args.dry_run)

    log.info("All passes complete.")

# Run the cleanup only when executed as a script, not when imported.
if __name__ == "__main__":
    main()