Initial commit: RECON codebase baseline

Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-20 06:34:40 +02:00 · 2026-04-14 14:57:23 +00:00 · 2026-04-14 14:57:23 +00:00 · 563c16bb71
commit 563c16bb71
59 changed files with 18327 additions and 0 deletions
--- a/scripts/validate.py
+++ b/scripts/validate.py
@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+"""
+RECON Pipeline Validator
+
+Checks pipeline consistency: paths, DB state, file integrity, and service connectivity.
+Validates TEI, Ollama, and Qdrant are reachable. Deep mode checks every document on disk.
+
+Usage: python3 scripts/validate.py [--deep]
+"""
+
+import json
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from lib.utils import get_config, setup_logging
+from lib.status import StatusDB
+
+logger = setup_logging('recon.validate')
+
+
+def run_validation(deep=False):
+    config = get_config()
+    db = StatusDB()
+
+    issues = []
+    warnings = []
+
+    print("=== RECON Validation ===\n")
+
+    # Check paths
+    for name, path in config['paths'].items():
+        if name == 'db':
+            if not os.path.exists(path):
+                issues.append(f"Database not found: {path}")
+        else:
+            if not os.path.exists(path):
+                warnings.append(f"Directory missing: {name} = {path}")
+
+    # Check library
+    if not os.path.exists(config['library_root']):
+        issues.append(f"Library root not found: {config['library_root']}")
+
+    # Check Gemini keys
+    keys = config.get('gemini_keys', [])
+    if not keys:
+        warnings.append("No Gemini API keys configured in .env")
+    else:
+        print(f"  Gemini keys: {len(keys)} configured")
+
+    # DB status counts
+    counts = db.get_status_counts()
+    cat = counts.get('catalogue', {})
+    doc = counts.get('documents', {})
+
+    print(f"  Catalogue: {sum(cat.values())} entries")
+    print(f"  Documents: {sum(doc.values())} entries")
+    print(f"  Complete: {doc.get('complete', 0)}")
+    print(f"  Failed: {doc.get('failed', 0)}")
+
+    if deep:
+        print("\n--- Deep Validation ---\n")
+
+        # Check every document in pipeline has corresponding files
+        all_docs = db.get_all_documents()
+        text_dir = config['paths']['text']
+        concepts_dir = config['paths']['concepts']
+
+        for d in all_docs:
+            h = d['hash']
+            status = d['status']
+
+            if status in ('extracted', 'enriched', 'complete'):
+                doc_text_dir = os.path.join(text_dir, h)
+                if not os.path.exists(doc_text_dir):
+                    issues.append(f"[{h[:8]}] {d['filename']}: text dir missing but status={status}")
+                elif deep:
+                    pages = [f for f in os.listdir(doc_text_dir) if f.startswith('page_')]
+                    if not pages:
+                        issues.append(f"[{h[:8]}] {d['filename']}: no page files in text dir")
+
+            if status in ('enriched', 'complete'):
+                doc_concepts_dir = os.path.join(concepts_dir, h)
+                if not os.path.exists(doc_concepts_dir):
+                    issues.append(f"[{h[:8]}] {d['filename']}: concepts dir missing but status={status}")
+                elif deep:
+                    windows = [f for f in os.listdir(doc_concepts_dir) if f.startswith('window_')]
+                    if not windows:
+                        issues.append(f"[{h[:8]}] {d['filename']}: no window files in concepts dir")
+                    else:
+                        for wf in windows:
+                            try:
+                                with open(os.path.join(doc_concepts_dir, wf)) as f:
+                                    data = json.load(f)
+                                if not isinstance(data, list):
+                                    issues.append(f"[{h[:8]}] {wf}: not a JSON array")
+                            except json.JSONDecodeError:
+                                issues.append(f"[{h[:8]}] {wf}: invalid JSON")
+
+        # Check orphaned directories
+        if os.path.exists(text_dir):
+            doc_hashes = {d['hash'] for d in all_docs}
+            for dirname in os.listdir(text_dir):
+                if dirname not in doc_hashes:
+                    warnings.append(f"Orphaned text dir: {dirname}")
+
+        if os.path.exists(concepts_dir):
+            for dirname in os.listdir(concepts_dir):
+                if dirname not in doc_hashes:
+                    warnings.append(f"Orphaned concepts dir: {dirname}")
+
+        print(f"  Checked {len(all_docs)} documents")
+
+    # Connectivity checks
+    print("\n--- Connectivity ---\n")
+
+    import requests as http_requests
+
+    # Check TEI (primary embedding backend)
+    try:
+        tei_url = f"http://{config['embedding']['tei_host']}:{config['embedding']['tei_port']}/info"
+        resp = http_requests.get(tei_url, timeout=10)
+        if resp.status_code == 200:
+            print(f"  TEI: OK (bge-m3 at {config['embedding']['tei_host']}:{config['embedding']['tei_port']})")
+        else:
+            issues.append(f"TEI: HTTP {resp.status_code}")
+    except Exception as e:
+        issues.append(f"TEI: unreachable ({e})")
+
+    # Check Ollama (fallback)
+    try:
+        ollama_url = f"http://{config['embedding']['ollama_host']}:{config['embedding']['ollama_port']}/api/tags"
+        resp = http_requests.get(ollama_url, timeout=10)
+        if resp.status_code == 200:
+            print(f"  Ollama: OK (fallback at {config['embedding']['ollama_host']}:{config['embedding']['ollama_port']})")
+        else:
+            warnings.append(f"Ollama: HTTP {resp.status_code}")
+    except Exception as e:
+        warnings.append(f"Ollama: unreachable ({e}) — fallback only, not critical")
+
+    try:
+        from qdrant_client import QdrantClient
+        qdrant = QdrantClient(
+            host=config['vector_db']['host'],
+            port=config['vector_db']['port'],
+            timeout=10
+        )
+        collections = [c.name for c in qdrant.get_collections().collections]
+        target = config['vector_db']['collection']
+        if target in collections:
+            info = qdrant.get_collection(target)
+            print(f"  Qdrant: OK ({target}: {info.points_count} points)")
+        else:
+            issues.append(f"Qdrant: collection {target} not found")
+    except Exception as e:
+        issues.append(f"Qdrant: unreachable ({e})")
+
+    # Summary
+    print("\n--- Summary ---\n")
+
+    if warnings:
+        print(f"Warnings ({len(warnings)}):")
+        for w in warnings:
+            print(f"  ⚠ {w}")
+
+    if issues:
+        print(f"\nIssues ({len(issues)}):")
+        for i in issues:
+            print(f"  ✗ {i}")
+        print(f"\nValidation FAILED: {len(issues)} issue(s)")
+    else:
+        print("Validation PASSED")
+
+
+if __name__ == '__main__':
+    deep = '--deep' in sys.argv
+    run_validation(deep=deep)