recon/scripts/validate.py

#!/usr/bin/env python3
"""
RECON Pipeline Validator

Checks pipeline consistency: paths, DB state, file integrity, and service connectivity.
Validates TEI, Ollama, and Qdrant are reachable. Deep mode checks every document on disk.

Usage: python3 scripts/validate.py [--deep]
"""

import json
import os
import sys

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from lib.utils import get_config, setup_logging
from lib.status import StatusDB

logger = setup_logging('recon.validate')


def _check_window_files(doc_concepts_dir, windows, tag, issues):
    """Record an issue for each window file that is not a readable JSON array.

    Args:
        doc_concepts_dir: Directory holding the document's window files.
        windows: File names (relative to ``doc_concepts_dir``) to check.
        tag: Short document identifier used as the message prefix.
        issues: List that issue messages are appended to.
    """
    for wf in windows:
        try:
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(os.path.join(doc_concepts_dir, wf), encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            issues.append(f"[{tag}] {wf}: invalid JSON")
        except OSError as e:
            # One unreadable file must not abort the rest of the validation.
            issues.append(f"[{tag}] {wf}: unreadable ({e})")
        else:
            if not isinstance(data, list):
                issues.append(f"[{tag}] {wf}: not a JSON array")


def _deep_validate(config, db, issues, warnings):
    """Check on-disk artefacts for every document and report orphaned dirs.

    Args:
        config: Configuration mapping; ``config['paths']['text']`` and
            ``config['paths']['concepts']`` are read.
        db: Object providing ``get_all_documents()``; each returned item is
            indexed by ``'hash'``, ``'status'`` and ``'filename'``.
        issues: List that issue messages are appended to.
        warnings: List that warning messages are appended to.
    """
    print("\n--- Deep Validation ---\n")

    all_docs = db.get_all_documents()
    text_dir = config['paths']['text']
    concepts_dir = config['paths']['concepts']

    for d in all_docs:
        h = d['hash']
        status = d['status']
        tag = h[:8]

        if status in ('extracted', 'enriched', 'complete'):
            doc_text_dir = os.path.join(text_dir, h)
            # isdir (not exists) so a stray regular file cannot crash listdir.
            if not os.path.isdir(doc_text_dir):
                issues.append(f"[{tag}] {d['filename']}: text dir missing but status={status}")
            elif not any(f.startswith('page_') for f in os.listdir(doc_text_dir)):
                issues.append(f"[{tag}] {d['filename']}: no page files in text dir")

        if status in ('enriched', 'complete'):
            doc_concepts_dir = os.path.join(concepts_dir, h)
            if not os.path.isdir(doc_concepts_dir):
                issues.append(f"[{tag}] {d['filename']}: concepts dir missing but status={status}")
            else:
                windows = [f for f in os.listdir(doc_concepts_dir) if f.startswith('window_')]
                if not windows:
                    issues.append(f"[{tag}] {d['filename']}: no window files in concepts dir")
                else:
                    _check_window_files(doc_concepts_dir, windows, tag, issues)

    # Build the hash set unconditionally: both orphan scans need it, and the
    # concepts scan must still work when the text directory is absent.
    doc_hashes = {d['hash'] for d in all_docs}
    for label, root in (('text', text_dir), ('concepts', concepts_dir)):
        if not os.path.isdir(root):
            continue
        for dirname in os.listdir(root):
            if dirname not in doc_hashes:
                warnings.append(f"Orphaned {label} dir: {dirname}")

    print(f"  Checked {len(all_docs)} documents")


def _check_connectivity(config, issues, warnings):
    """Probe TEI, Ollama and Qdrant and record unreachable services.

    TEI and Qdrant failures are recorded as issues; Ollama failures are
    recorded as warnings only.

    Args:
        config: Configuration mapping; the ``'embedding'`` and ``'vector_db'``
            sections are read.
        issues: List that issue messages are appended to.
        warnings: List that warning messages are appended to.
    """
    print("\n--- Connectivity ---\n")

    import requests as http_requests

    # Check TEI (primary embedding backend). Config lookups stay inside the
    # try so a missing key is reported instead of aborting the run.
    try:
        tei_url = f"http://{config['embedding']['tei_host']}:{config['embedding']['tei_port']}/info"
        resp = http_requests.get(tei_url, timeout=10)
        if resp.status_code == 200:
            print(f"  TEI: OK (bge-m3 at {config['embedding']['tei_host']}:{config['embedding']['tei_port']})")
        else:
            issues.append(f"TEI: HTTP {resp.status_code}")
    except Exception as e:
        issues.append(f"TEI: unreachable ({e})")

    # Check Ollama (fallback): problems here are warnings, not issues.
    try:
        ollama_url = f"http://{config['embedding']['ollama_host']}:{config['embedding']['ollama_port']}/api/tags"
        resp = http_requests.get(ollama_url, timeout=10)
        if resp.status_code == 200:
            print(f"  Ollama: OK (fallback at {config['embedding']['ollama_host']}:{config['embedding']['ollama_port']})")
        else:
            warnings.append(f"Ollama: HTTP {resp.status_code}")
    except Exception as e:
        warnings.append(f"Ollama: unreachable ({e}) — fallback only, not critical")

    # Check Qdrant: the configured collection must exist.
    try:
        from qdrant_client import QdrantClient
        qdrant = QdrantClient(
            host=config['vector_db']['host'],
            port=config['vector_db']['port'],
            timeout=10
        )
        collections = [c.name for c in qdrant.get_collections().collections]
        target = config['vector_db']['collection']
        if target in collections:
            info = qdrant.get_collection(target)
            print(f"  Qdrant: OK ({target}: {info.points_count} points)")
        else:
            issues.append(f"Qdrant: collection {target} not found")
    except Exception as e:
        issues.append(f"Qdrant: unreachable ({e})")


def _print_summary(issues, warnings):
    """Print collected warnings and issues followed by the final verdict."""
    print("\n--- Summary ---\n")

    if warnings:
        print(f"Warnings ({len(warnings)}):")
        for w in warnings:
            print(f"  ⚠ {w}")

    if issues:
        print(f"\nIssues ({len(issues)}):")
        for i in issues:
            print(f"  ✗ {i}")
        print(f"\nValidation FAILED: {len(issues)} issue(s)")
    else:
        print("Validation PASSED")


def run_validation(deep=False):
    """Validate pipeline paths, DB state, on-disk files and services.

    Prints a report to stdout. Problems are collected as *issues* (which
    fail the validation) or *warnings* (which are only reported).

    Args:
        deep: When true, additionally check the text and concepts
            directories of every document returned by the status DB and
            report directories that match no known document hash.

    Returns:
        True when no issues were recorded, False otherwise. Warnings do not
        affect the result.
    """
    config = get_config()
    db = StatusDB()

    issues = []
    warnings = []

    print("=== RECON Validation ===\n")

    # Check paths: a missing database is an issue, any other missing path
    # is only a warning.
    for name, path in config['paths'].items():
        if os.path.exists(path):
            continue
        if name == 'db':
            issues.append(f"Database not found: {path}")
        else:
            warnings.append(f"Directory missing: {name} = {path}")

    # Check library
    if not os.path.exists(config['library_root']):
        issues.append(f"Library root not found: {config['library_root']}")

    # Check Gemini keys
    keys = config.get('gemini_keys', [])
    if not keys:
        warnings.append("No Gemini API keys configured in .env")
    else:
        print(f"  Gemini keys: {len(keys)} configured")

    # DB status counts
    counts = db.get_status_counts()
    cat = counts.get('catalogue', {})
    doc = counts.get('documents', {})

    print(f"  Catalogue: {sum(cat.values())} entries")
    print(f"  Documents: {sum(doc.values())} entries")
    print(f"  Complete: {doc.get('complete', 0)}")
    print(f"  Failed: {doc.get('failed', 0)}")

    if deep:
        _deep_validate(config, db, issues, warnings)

    _check_connectivity(config, issues, warnings)
    _print_summary(issues, warnings)

    return not issues

if __name__ == '__main__':
    # '--deep' anywhere on the command line enables the per-document checks.
    deep = '--deep' in sys.argv
    # Exit non-zero only on an explicit False verdict so shells and CI can
    # detect a failed validation; any other return value (including None)
    # leaves the exit status at 0.
    if run_validation(deep=deep) is False:
        sys.exit(1)
Initial commit: RECON codebase baseline Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-04-14 14:57:23 +00:00			`#!/usr/bin/env python3`
			`"""`
			`RECON Pipeline Validator`

			`Checks pipeline consistency: paths, DB state, file integrity, and service connectivity.`
			`Validates TEI, Ollama, and Qdrant are reachable. Deep mode checks every document on disk.`

			`Usage: python3 scripts/validate.py [--deep]`
			`"""`

			`import json`
			`import os`
			`import sys`

			`sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))`

			`from lib.utils import get_config, setup_logging`
			`from lib.status import StatusDB`

			`logger = setup_logging('recon.validate')`


			`def run_validation(deep=False):`
			`config = get_config()`
			`db = StatusDB()`

			`issues = []`
			`warnings = []`

			`print("=== RECON Validation ===\n")`

			`# Check paths`
			`for name, path in config['paths'].items():`
			`if name == 'db':`
			`if not os.path.exists(path):`
			`issues.append(f"Database not found: {path}")`
			`else:`
			`if not os.path.exists(path):`
			`warnings.append(f"Directory missing: {name} = {path}")`

			`# Check library`
			`if not os.path.exists(config['library_root']):`
			`issues.append(f"Library root not found: {config['library_root']}")`

			`# Check Gemini keys`
			`keys = config.get('gemini_keys', [])`
			`if not keys:`
			`warnings.append("No Gemini API keys configured in .env")`
			`else:`
			`print(f" Gemini keys: {len(keys)} configured")`

			`# DB status counts`
			`counts = db.get_status_counts()`
			`cat = counts.get('catalogue', {})`
			`doc = counts.get('documents', {})`

			`print(f" Catalogue: {sum(cat.values())} entries")`
			`print(f" Documents: {sum(doc.values())} entries")`
			`print(f" Complete: {doc.get('complete', 0)}")`
			`print(f" Failed: {doc.get('failed', 0)}")`

			`if deep:`
			`print("\n--- Deep Validation ---\n")`

			`# Check every document in pipeline has corresponding files`
			`all_docs = db.get_all_documents()`
			`text_dir = config['paths']['text']`
			`concepts_dir = config['paths']['concepts']`

			`for d in all_docs:`
			`h = d['hash']`
			`status = d['status']`

			`if status in ('extracted', 'enriched', 'complete'):`
			`doc_text_dir = os.path.join(text_dir, h)`
			`if not os.path.exists(doc_text_dir):`
			`issues.append(f"[{h[:8]}] {d['filename']}: text dir missing but status={status}")`
			`elif deep:`
			`pages = [f for f in os.listdir(doc_text_dir) if f.startswith('page_')]`
			`if not pages:`
			`issues.append(f"[{h[:8]}] {d['filename']}: no page files in text dir")`

			`if status in ('enriched', 'complete'):`
			`doc_concepts_dir = os.path.join(concepts_dir, h)`
			`if not os.path.exists(doc_concepts_dir):`
			`issues.append(f"[{h[:8]}] {d['filename']}: concepts dir missing but status={status}")`
			`elif deep:`
			`windows = [f for f in os.listdir(doc_concepts_dir) if f.startswith('window_')]`
			`if not windows:`
			`issues.append(f"[{h[:8]}] {d['filename']}: no window files in concepts dir")`
			`else:`
			`for wf in windows:`
			`try:`
			`with open(os.path.join(doc_concepts_dir, wf)) as f:`
			`data = json.load(f)`
			`if not isinstance(data, list):`
			`issues.append(f"[{h[:8]}] {wf}: not a JSON array")`
			`except json.JSONDecodeError:`
			`issues.append(f"[{h[:8]}] {wf}: invalid JSON")`

			`# Check orphaned directories`
			`if os.path.exists(text_dir):`
			`doc_hashes = {d['hash'] for d in all_docs}`
			`for dirname in os.listdir(text_dir):`
			`if dirname not in doc_hashes:`
			`warnings.append(f"Orphaned text dir: {dirname}")`

			`if os.path.exists(concepts_dir):`
			`for dirname in os.listdir(concepts_dir):`
			`if dirname not in doc_hashes:`
			`warnings.append(f"Orphaned concepts dir: {dirname}")`

			`print(f" Checked {len(all_docs)} documents")`

			`# Connectivity checks`
			`print("\n--- Connectivity ---\n")`

			`import requests as http_requests`

			`# Check TEI (primary embedding backend)`
			`try:`
			`tei_url = f"http://{config['embedding']['tei_host']}:{config['embedding']['tei_port']}/info"`
			`resp = http_requests.get(tei_url, timeout=10)`
			`if resp.status_code == 200:`
			`print(f" TEI: OK (bge-m3 at {config['embedding']['tei_host']}:{config['embedding']['tei_port']})")`
			`else:`
			`issues.append(f"TEI: HTTP {resp.status_code}")`
			`except Exception as e:`
			`issues.append(f"TEI: unreachable ({e})")`

			`# Check Ollama (fallback)`
			`try:`
			`ollama_url = f"http://{config['embedding']['ollama_host']}:{config['embedding']['ollama_port']}/api/tags"`
			`resp = http_requests.get(ollama_url, timeout=10)`
			`if resp.status_code == 200:`
			`print(f" Ollama: OK (fallback at {config['embedding']['ollama_host']}:{config['embedding']['ollama_port']})")`
			`else:`
			`warnings.append(f"Ollama: HTTP {resp.status_code}")`
			`except Exception as e:`
			`warnings.append(f"Ollama: unreachable ({e}) — fallback only, not critical")`

			`try:`
			`from qdrant_client import QdrantClient`
			`qdrant = QdrantClient(`
			`host=config['vector_db']['host'],`
			`port=config['vector_db']['port'],`
			`timeout=10`
			`)`
			`collections = [c.name for c in qdrant.get_collections().collections]`
			`target = config['vector_db']['collection']`
			`if target in collections:`
			`info = qdrant.get_collection(target)`
			`print(f" Qdrant: OK ({target}: {info.points_count} points)")`
			`else:`
			`issues.append(f"Qdrant: collection {target} not found")`
			`except Exception as e:`
			`issues.append(f"Qdrant: unreachable ({e})")`

			`# Summary`
			`print("\n--- Summary ---\n")`

			`if warnings:`
			`print(f"Warnings ({len(warnings)}):")`
			`for w in warnings:`
			`print(f" ⚠ {w}")`

			`if issues:`
			`print(f"\nIssues ({len(issues)}):")`
			`for i in issues:`
			`print(f" ✗ {i}")`
			`print(f"\nValidation FAILED: {len(issues)} issue(s)")`
			`else:`
			`print("Validation PASSED")`


			`if __name__ == '__main__':`
			`deep = '--deep' in sys.argv`
			`run_validation(deep=deep)`