implement three-tier cascade: Qdrant → Kiwix → SearXNG

- Add Kiwix integration with HTML parser for offline Wikipedia search
- Add SearXNG integration for web search fallback
- Cascade triggered when FlashRank top-1 score < 0.5 threshold
- Context tagging: [DOMAIN_KNOWLEDGE], [OFFLINE_WIKI], [WEB_SEARCH]
- Cascade decision logging to /opt/recon/logs/cascade.jsonl
- Graceful degradation: skip unavailable tiers
- Version bumped to 5.0.0

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-05-20 06:34:40 +02:00 · 2026-05-07 22:33:14 +00:00 · 2026-05-07 22:33:14 +00:00 · 81611110eb
commit 81611110eb
parent 5e5399de5c
1 changed files with 1653 additions and 1119 deletions
--- a/tools/recon_rag_tool.py
+++ b/tools/recon_rag_tool.py
@ -1,8 +1,8 @@
 """
 title: RECON Knowledge Base
 author: Echo6
-version: 4.3.0
+version: 5.0.0
-description: RAG filter that searches the RECON knowledge base and injects reference material into Aurora's context. Emits citations with PDF download links. Supports intent-based metadata filtering, FlashRank neural reranking with MMR diversity, Ollama-powered query expansion, transcript source boosting, semantic query routing with inline navigation, and address book place resolution.
+description: RAG filter with three-tier cascade: Qdrant (domain knowledge) → Kiwix (offline wiki) → SearXNG (web search). Supports intent-based metadata filtering, FlashRank neural reranking with MMR diversity, Ollama-powered query expansion, transcript source boosting, semantic query routing with inline navigation, and address book place resolution.
 """
 import logging
@ -10,8 +10,13 @@ import json
 import math
 import re
 import threading
 import html
 from datetime import datetime
 from html.parser import HTMLParser
 from pathlib import Path
 from typing import Optional, Callable, Awaitable
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from urllib.parse import quote, unquote
 import requests
 from pydantic import BaseModel, Field
@ -22,6 +27,26 @@ log = logging.getLogger(__name__)
 # even if OWI instantiates separate Filter objects per call.
 _SOURCE_STORE: dict[str, list] = {}
 # ── CASCADE CONFIGURATION (v5.0.0) ───────────────────────────────────────────
 # FlashRank score threshold for Tier 1 (Qdrant). Below this, fall through to Tier 2.
 # Based on calibration: RECON queries cluster at 0.95-1.0, misses below 0.3.
 # 0.5 is conservative - will let more through to Kiwix than strictly necessary.
 # NOTE(review): the cascade decision in inlet() compares against
 # valves.cascade_threshold (which also defaults to 0.5); this constant does not
 # appear to be referenced in the visible code - confirm whether it is still needed.
 CASCADE_CONFIDENCE_THRESHOLD = 0.5
 # Kiwix-serve configuration
 # Base URL that the catalog, search and article requests are built on.
 KIWIX_BASE_URL = "http://localhost:8430"
 # Used for both the catalog listing and the per-book search requests.
 KIWIX_SEARCH_TIMEOUT = 5  # seconds
 # Used when downloading a single article for context.
 KIWIX_ARTICLE_TIMEOUT = 5  # seconds
 # Number of hits formatted into the prompt; _search_kiwix() returns up to
 # twice this many so the caller can trim.
 KIWIX_MAX_RESULTS = 3
 # SearXNG configuration
 SEARXNG_URL = "http://192.168.1.102:8080"
 SEARXNG_TIMEOUT = 5  # seconds
 # Cap applied to the result list inside _search_searxng().
 SEARXNG_MAX_RESULTS = 5
 # Cascade logging
 # JSONL file (one JSON object per line) appended to by _log_cascade_decision().
 CASCADE_LOG_PATH = Path("/opt/recon/logs/cascade.jsonl")
 # ── Semantic Query Router (v4.3.0) ───────────────────────────────────────────
 ROUTE_EXAMPLES = {
    "nav_route": [
@ -359,6 +384,247 @@ def _address_book_lookup(query: str, address_book_url: str) -> dict | None:
 # ── End router/nav code ──────────────────────────────────────────────────────
 # ── Kiwix Search Helpers (v5.0.0) ────────────────────────────────────────────
 class _KiwixResultParser(HTMLParser):
    """Parse Kiwix search results HTML to extract articles."""
    def __init__(self):
        super().__init__()
        self.results = []
        self._in_results = False
        self._in_li = False
        self._in_cite = False
        self._in_info = False
        self._current = {}
        self._capture_text = False
    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        if tag == "div" and "results" in attrs_dict.get("class", ""):
            self._in_results = True
        elif self._in_results and tag == "li":
            self._in_li = True
            self._current = {"title": "", "url": "", "snippet": "", "word_count": ""}
        elif self._in_li and tag == "a" and not self._current.get("url"):
            self._current["url"] = attrs_dict.get("href", "")
            self._capture_text = True
        elif self._in_li and tag == "cite":
            self._in_cite = True
            self._capture_text = True
        elif self._in_li and tag == "div" and "informations" in attrs_dict.get("class", ""):
            self._in_info = True
            self._capture_text = True
    def handle_endtag(self, tag):
        if tag == "div" and self._in_results and not self._in_li:
            self._in_results = False
        elif tag == "li" and self._in_li:
            if self._current.get("url"):
                self.results.append(self._current)
            self._current = {}
            self._in_li = False
        elif tag == "a" and self._capture_text and not self._in_cite:
            self._capture_text = False
        elif tag == "cite":
            self._in_cite = False
            self._capture_text = False
        elif tag == "div" and self._in_info:
            self._in_info = False
            self._capture_text = False
    def handle_data(self, data):
        if self._capture_text and self._in_li:
            text = data.strip()
            if self._in_cite:
                self._current["snippet"] += text + " "
            elif self._in_info:
                self._current["word_count"] = text
            elif not self._current.get("title"):
                self._current["title"] = text
 def _strip_html_tags(html_content: str) -> str:
    """Simple HTML to plain text conversion using stdlib."""
    # Remove script and style elements
    text = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
    # Remove tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Decode entities
    text = html.unescape(text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text
 def _fetch_kiwix_books() -> list[str]:
    """Return the book names advertised by the kiwix-serve catalog.
    Names are taken from every ``href="/content/<name>"`` attribute in the
    ``/catalog/v2/entries`` response, de-duplicated while keeping the order in
    which the catalog lists them. Best-effort: any failure is logged and an
    empty list is returned, never raised.
    """
    try:
        resp = requests.get(
            f"{KIWIX_BASE_URL}/catalog/v2/entries",
            timeout=KIWIX_SEARCH_TIMEOUT,
        )
        resp.raise_for_status()
        books = re.findall(r'href="/content/([^"]+)"', resp.text)
        # dict.fromkeys de-duplicates without losing order. A set would return
        # the names in a hash-randomised order, and _search_kiwix() only looks
        # at the first few books, so the searched subset would change per process.
        return list(dict.fromkeys(books))
    except Exception as e:
        log.warning(f"Failed to fetch Kiwix book list: {e}")
        return []
 def _search_kiwix_book(book: str, query: str, limit: int = 5) -> list[dict]:
    """Run a full-text search against one Kiwix book.
    Returns the parsed hits, each tagged with a "book" key naming the book it
    came from. A non-200 response or any exception yields an empty list.
    """
    search_params = {"content": book, "pattern": query, "limit": limit}
    try:
        response = requests.get(
            f"{KIWIX_BASE_URL}/search",
            params=search_params,
            timeout=KIWIX_SEARCH_TIMEOUT,
        )
        if response.status_code != 200:
            return []
        extractor = _KiwixResultParser()
        extractor.feed(response.text)
        hits = extractor.results
        # Record the origin so later stages can build per-book links.
        for hit in hits:
            hit["book"] = book
        return hits
    except Exception as e:
        log.warning(f"Kiwix search failed for {book}: {e}")
        return []
 def _fetch_kiwix_article(url_path: str) -> str:
    """Download a Kiwix article and return up to 4000 characters of its text.
    The page is narrowed to the first container that matches, tried in order:
    <main>, <article>, then a <div> whose class contains "content". If none
    match, the whole page is used. Returns "" on any failure.
    """
    # NOTE: the <div> pattern is non-greedy, so it stops at the first </div>,
    # which may be a nested one.
    container_patterns = (
        r'<main[^>]*>(.*?)</main>',
        r'<article[^>]*>(.*?)</article>',
        r'<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>',
    )
    try:
        response = requests.get(
            f"{KIWIX_BASE_URL}{url_path}",
            timeout=KIWIX_ARTICLE_TIMEOUT,
        )
        response.raise_for_status()
        page = response.text
        for pattern in container_patterns:
            found = re.search(pattern, page, re.DOTALL | re.IGNORECASE)
            if found:
                page = found.group(1)
                break
        return _strip_html_tags(page)[:4000]  # Limit to 4000 chars
    except Exception as e:
        log.warning(f"Failed to fetch Kiwix article {url_path}: {e}")
        return ""
 def _search_kiwix(query: str, books: list[str]) -> list[dict]:
    """Search a prioritised subset of Kiwix books and return the merged hits.
    Books whose name marks them as English are searched first (at most three).
    Books without one of the listed foreign-language markers are consulted
    only when the English ones produced fewer than KIWIX_MAX_RESULTS hits (at
    most two). At most twice KIWIX_MAX_RESULTS hits are returned.
    """
    english_markers = ("wikipedia_en", "_en_", "_eng_")
    foreign_markers = ("_af_", "_de_", "_fr_", "_es_")
    preferred = []
    fallback = []
    for name in books:
        if any(marker in name for marker in english_markers):
            preferred.append(name)
            continue
        if any(marker in name for marker in foreign_markers):
            # Known non-English book: never searched.
            continue
        fallback.append(name)
    hits = []
    for name in preferred[:3]:
        hits += _search_kiwix_book(name, query, limit=5)
    if len(hits) < KIWIX_MAX_RESULTS:
        for name in fallback[:2]:
            hits += _search_kiwix_book(name, query, limit=3)
    # Twice the display limit, leaving the caller room to trim.
    return hits[:KIWIX_MAX_RESULTS * 2]
 # ── SearXNG Search Helpers (v5.0.0) ──────────────────────────────────────────
 def _search_searxng(query: str) -> list[dict]:
    """Query SearXNG's JSON API and return the leading results.
    Each result is reduced to the keys "title", "url", "snippet", "engines"
    and "score". At most SEARXNG_MAX_RESULTS entries are returned. Every
    failure mode (non-200 status, timeout, connection error, anything else)
    is logged and produces an empty list.
    """
    try:
        response = requests.get(
            f"{SEARXNG_URL}/search",
            params={"q": query, "format": "json"},
            timeout=SEARXNG_TIMEOUT,
        )
        if response.status_code != 200:
            log.warning(f"SearXNG returned status {response.status_code}")
            return []
        raw_hits = response.json().get("results", [])
        return [
            {
                "title": hit.get("title", ""),
                "url": hit.get("url", ""),
                # SearXNG calls the excerpt "content"; downstream code expects "snippet".
                "snippet": hit.get("content", ""),
                "engines": hit.get("engines", []),
                "score": hit.get("score", 0),
            }
            for hit in raw_hits[:SEARXNG_MAX_RESULTS]
        ]
    except requests.Timeout:
        log.warning("SearXNG request timed out (offline or slow)")
        return []
    except requests.ConnectionError:
        log.warning("SearXNG connection failed (offline)")
        return []
    except Exception as e:
        log.warning(f"SearXNG search failed: {e}")
        return []
 # ── Cascade Logging (v5.0.0) ─────────────────────────────────────────────────
 def _log_cascade_decision(
    query: str,
    router_intent: str,
    top_1_score: float,
    tier_used: int,
    num_results: int,
 ):
    """Append one cascade decision to the JSONL log used for threshold tuning.
    Each line is a JSON object with the keys "timestamp" (UTC, ISO 8601 with a
    trailing "Z"), "query", "router_intent", "top_1_score" (rounded to four
    decimals), "tier_used" and "num_results". Logging is best-effort: any
    failure is reported as a warning and never propagates to the caller.
    """
    # Local import: the module only imports the `datetime` class itself.
    from datetime import timezone
    try:
        CASCADE_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        # datetime.utcnow() is deprecated. Take an aware UTC time and drop the
        # tzinfo so the serialised form keeps the existing "...Z" layout.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        entry = {
            "timestamp": now_utc.isoformat() + "Z",
            "query": query,
            "router_intent": router_intent,
            # Coerce to a builtin float first: round() on a non-builtin numeric
            # (presumably a numpy float from the reranker - confirm) keeps that
            # type, json.dumps() then fails, and the entry would be dropped.
            "top_1_score": round(float(top_1_score), 4),
            "tier_used": tier_used,
            "num_results": num_results,
        }
        with open(CASCADE_LOG_PATH, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry) + "\n")
    except Exception as e:
        log.warning(f"Failed to log cascade decision: {e}")
 # ── End cascade helpers ──────────────────────────────────────────────────────
 # Subdomains excluded from Medical results when tactical context detected
 _OBSTETRIC_SUBDOMAINS = [
    "Obstetrics", "Midwifery", "Pregnancy", "Pregnancy Care",
@ -493,11 +759,27 @@ class Filter:
            default="http://100.64.0.24:8420",
            description="RECON address book API base URL",
        )
        cascade_enabled: bool = Field(
            default=True,
            description="Enable three-tier cascade (Qdrant → Kiwix → SearXNG)",
        )
        cascade_threshold: float = Field(
            default=0.5,
            description="FlashRank score threshold for cascade fallthrough",
        )
    def __init__(self):
        """Create the valve settings and start every per-instance cache empty."""
        # Stays None until _get_kiwix_books() fetches the catalog.
        self._kiwix_books: list[str] | None = None
        # Starts unset; presumably populated by the reranking code (not visible here).
        self._ranker = None
        self._expansion_cache: dict[str, list[str]] = {}
        self.valves = self.Valves()
    def _get_kiwix_books(self) -> list[str]:
        """Return the Kiwix book list, fetching and caching it on first success.
        Only a non-empty list is cached. _fetch_kiwix_books() returns [] when
        kiwix-serve is unreachable, and caching that would keep the Kiwix tier
        disabled until the process restarts; an empty result is instead
        retried on the next call.
        """
        if self._kiwix_books:
            return self._kiwix_books
        books = _fetch_kiwix_books()
        if books:
            self._kiwix_books = books
            log.info(f"Loaded {len(books)} Kiwix books")
        return books
    def _embed_query(self, text: str) -> list:
        """Embed a query string using TEI."""
@ -544,7 +826,7 @@ class Filter:
        for item in ranked[:self.valves.rerank_top_n]:
            idx = item["id"]
            result_copy = dict(results[idx])
-            result_copy["score"] = item["score"]
+            result_copy["score"] = float(item["score"])
            reranked.append(result_copy)
        return reranked
@ -756,7 +1038,7 @@ class Filter:
                    log.warning(f"Expanded search for {term!r} failed: {e}")
        return results
-    def _format_context(self, results: list) -> str:
+    def _format_context(self, results: list, tier_tag: str = "DOMAIN_KNOWLEDGE") -> str:
        """Format search results into a context block for the system prompt."""
        if not results:
            return ""
@ -814,7 +1096,69 @@ class Filter:
            else:
                dl_str = ""
-            block = f"[{i}] {citation} (relevance: {score:.2f})\n{summary}{facts_str}{domain_str}{dl_str}"
+            block = f"[{tier_tag}:{i}] {citation} (relevance: {score:.2f})\n{summary}{facts_str}{domain_str}{dl_str}"
            blocks.append(block)
        return "\n\n".join(blocks)
    def _format_kiwix_context(self, results: list[dict]) -> str:
        """Format Kiwix search results into a context block."""
        if not results:
            return ""
        blocks = []
        for i, r in enumerate(results, 1):
            title = r.get("title", "Unknown")
            snippet = r.get("snippet", "").strip()
            book = r.get("book", "")
            url_path = r.get("url", "")
            # Build wiki URL
            if url_path:
                # Extract article path from /content/book/path
                path_match = re.search(r'/content/[^/]+/(.+)$', url_path)
                if path_match:
                    article_path = path_match.group(1)
                    wiki_url = f"https://wiki.echo6.co/viewer#{book}/{article_path}"
                else:
                    wiki_url = f"https://wiki.echo6.co/viewer#{book}"
            else:
                wiki_url = ""
            # Fetch article content if available
            content = ""
            if url_path:
                content = _fetch_kiwix_article(url_path)
                if content:
                    content = content[:1500]  # Limit per article
            if not content:
                content = snippet
            block = f"[OFFLINE_WIKI:{i}] {title}\n{content}"
            if wiki_url:
                block += f"\nSource: {wiki_url}"
            blocks.append(block)
        return "\n\n".join(blocks)
    def _format_searxng_context(self, results: list[dict]) -> str:
        """Format SearXNG search results into a context block."""
        if not results:
            return ""
        blocks = []
        for i, r in enumerate(results, 1):
            title = r.get("title", "Unknown")
            snippet = r.get("snippet", "")
            url = r.get("url", "")
            engines = r.get("engines", [])
            engine_str = f" (via {', '.join(engines[:2])})" if engines else ""
            block = f"[WEB_SEARCH:{i}] {title}{engine_str}\n{snippet}"
            if url:
                block += f"\nSource: {url}"
            blocks.append(block)
        return "\n\n".join(blocks)
@ -838,11 +1182,14 @@ class Filter:
        if not query or len(query.strip()) < 3:
            return body
        router_intent = "rag_search"
        # ── ROUTER GATE (v4.3.0) ─────────────────────────────────────────
        if self.valves.router_enabled:
            route, confidence = _classify_query(
                query, self.valves.tei_url, self.valves.router_threshold
            )
            router_intent = route
            log.info(f"Router: {query!r} → {route} ({confidence:.3f})")
            if route == "direct_answer":
@ -903,6 +1250,11 @@ class Filter:
                }
            )
        tier_used = 1
        top_1_score = 0.0
        final_context = ""
        final_results = []
        try:
            vector = self._embed_query(query)
@ -975,31 +1327,110 @@ class Filter:
                results = _rerank_by_keyword_overlap(query, results)
                results = results[:self.valves.top_k]
-            # Store results for outlet citations (module-level, keyed by chat_id)
+            # Get top-1 score for cascade decision
            top_1_score = results[0]["score"] if results else 0.0
            # ── CASCADE DECISION POINT (v5.0.0) ──────────────────────────────
            if self.valves.cascade_enabled and top_1_score < self.valves.cascade_threshold:
                # Tier 1 score too low, try Tier 2 (Kiwix)
                log.info(f"Cascade: Tier 1 score {top_1_score:.3f} < {self.valves.cascade_threshold}, trying Kiwix")
                if __event_emitter__:
                    await __event_emitter__(
                        {"type": "status", "data": {"description": "Searching offline encyclopedia...", "done": False}}
                    )
                kiwix_results = _search_kiwix(query, self._get_kiwix_books())
                if kiwix_results:
                    tier_used = 2
                    final_context = self._format_kiwix_context(kiwix_results[:KIWIX_MAX_RESULTS])
                    log.info(f"Cascade: Tier 2 (Kiwix) returned {len(kiwix_results)} results")
                else:
                    # Tier 2 failed, try Tier 3 (SearXNG)
                    log.info("Cascade: Tier 2 empty, trying SearXNG")
                    if __event_emitter__:
                        await __event_emitter__(
                            {"type": "status", "data": {"description": "Searching the web...", "done": False}}
                        )
                    searxng_results = _search_searxng(query)
                    if searxng_results:
                        tier_used = 3
                        final_context = self._format_searxng_context(searxng_results)
                        log.info(f"Cascade: Tier 3 (SearXNG) returned {len(searxng_results)} results")
                    else:
                        # All tiers exhausted, fall back to whatever Tier 1 had
                        log.info("Cascade: All tiers exhausted, using Tier 1 results")
                        tier_used = 1
                        final_context = self._format_context(results, "DOMAIN_KNOWLEDGE")
                        final_results = results
            else:
                # Tier 1 score good enough, use Qdrant results
                tier_used = 1
                final_context = self._format_context(results, "DOMAIN_KNOWLEDGE")
                final_results = results
            # Store results for outlet citations (only for Tier 1)
            if tier_used == 1:
                chat_id = body.get("chat_id", body.get("metadata", {}).get("chat_id", ""))
                if chat_id:
-                _SOURCE_STORE[chat_id] = results
+                    _SOURCE_STORE[chat_id] = final_results
-            # Build context block
+            # Log cascade decision
-            context = self._format_context(results)
+            _log_cascade_decision(
                query=query,
                router_intent=router_intent,
                top_1_score=top_1_score,
                tier_used=tier_used,
                num_results=len(results) if tier_used == 1 else (len(kiwix_results) if tier_used == 2 else len(searxng_results) if tier_used == 3 else 0),
            )
-            if context:
+            # Build the RAG prompt with tier-appropriate instructions
            if final_context:
                if tier_used == 1:
                    rag_prompt = (
                        "You have access to the RECON knowledge base — a curated library of military field manuals, "
                        "survival guides, preparedness literature, and video transcripts. Answer the user's question using "
-                    "the reference material below. Reference sources using [1], [2], [3] etc. matching the "
+                        "the reference material below. Reference sources using [DOMAIN_KNOWLEDGE:1], [DOMAIN_KNOWLEDGE:2], etc.\n\n"
                    "numbered sources provided. Use these numbers inline in your response.\n\n"
                        "If the reference material doesn't adequately answer the question, say so explicitly rather "
                        "than filling gaps with general knowledge.\n\n"
                        "---REFERENCE MATERIAL---\n\n"
-                    f"{context}\n\n"
+                        f"{final_context}\n\n"
                        "---END REFERENCE MATERIAL---"
                    )
                elif tier_used == 2:
                    rag_prompt = (
                        "The RECON domain knowledge base did not have high-confidence results for this query. "
                        "The following information comes from offline Wikipedia/encyclopedia sources (Kiwix). "
                        "Reference sources using [OFFLINE_WIKI:1], [OFFLINE_WIKI:2], etc.\n\n"
                        "Note: This is general encyclopedia content, not domain-specific preparedness material.\n\n"
                        "---OFFLINE WIKI CONTENT---\n\n"
                        f"{final_context}\n\n"
                        "---END OFFLINE WIKI CONTENT---"
                    )
                else:  # tier_used == 3
                    rag_prompt = (
                        "Neither the RECON knowledge base nor offline encyclopedias had relevant content. "
                        "The following information comes from a live web search. Reference sources using [WEB_SEARCH:1], etc.\n\n"
                        "Note: Web search results may be less reliable than curated sources. Verify important information.\n\n"
                        "---WEB SEARCH RESULTS---\n\n"
                        f"{final_context}\n\n"
                        "---END WEB SEARCH RESULTS---"
                    )
            else:
                rag_prompt = (
                    "You have access to the RECON knowledge base, but no relevant reference material was "
-                    "found for this query. Answer from your general knowledge and clearly flag that your "
+                    "found for this query in any tier (domain knowledge, offline wiki, or web search). "
-                    "response is NOT backed by the RECON reference library."
+                    "Answer from your general knowledge and clearly flag that your response is NOT backed by references."
                )
            # Add source priority instruction
            rag_prompt += (
                "\n\nSource priority: When sources overlap, prefer DOMAIN_KNOWLEDGE over OFFLINE_WIKI over WEB_SEARCH. "
                "Always cite which tier your information came from."
            )
            # Inject into system message
@ -1013,8 +1444,10 @@ class Filter:
                    0, {"role": "system", "content": rag_prompt}
                )
            # Emit final status
            if __event_emitter__:
-                status_msg = f"Found {len(results)} reference{'s' if len(results) != 1 else ''}" if results else "No matching references found"
+                tier_names = {1: "RECON", 2: "Kiwix", 3: "Web"}
                status_msg = f"Found results from {tier_names.get(tier_used, 'unknown')} (Tier {tier_used})"
                await __event_emitter__(
                    {
                        "type": "status",
@ -1117,3 +1550,104 @@ class Filter:
                log.warning(f"Failed to emit citation (id={pid}): {e}")
        return body
 # ── TEST BLOCK ───────────────────────────────────────────────────────────────
 if __name__ == "__main__":
    import asyncio
    # Manual smoke test: pushes a handful of queries through Filter.inlet()
    # and reports which cascade tier supplied the injected context. It needs
    # the live backing services and asserts nothing; read the output by eye.
    # Test queries for each tier
    TEST_QUERIES = [
        ("tourniquet application steps", "Should hit Tier 1 (RECON)"),
        ("population of Ukraine", "Should hit Tier 2 (Kiwix)"),
        ("history of the Winter War between Finland and Russia", "Should hit Tier 2 (Kiwix)"),
        ("latest iPhone reviews 2026", "Should hit Tier 3 (SearXNG)"),
        ("compass declination adjustment", "Should hit Tier 1 (RECON)"),
        ("what is the Coriolis effect", "Could go either way"),
    ]
    # Checked in cascade order; the first tag found in the injected prompt wins.
    TIER_TAGS = (
        ("[DOMAIN_KNOWLEDGE:", 1),
        ("[OFFLINE_WIKI:", 2),
        ("[WEB_SEARCH:", 3),
    )
    async def run_tests():
        """Run every test query through inlet() and print a per-query and summary report.
        Returns a list of dicts with the keys "query", "expected" and "tier",
        plus "error" when the call raised. "tier" is 1-3 for a tagged prompt,
        0 when a system prompt was injected without any tier tag, and None
        when no system message was injected at all.
        """
        f = Filter()
        results = []
        print("=" * 70)
        print("CASCADE TEST RESULTS")
        print("=" * 70)
        for query, expected in TEST_QUERIES:
            print(f"\n{'─' * 70}")
            print(f"Query: {query}")
            print(f"Expected: {expected}")
            print("─" * 70)
            # Simulate a request body
            body = {
                "messages": [
                    {"role": "user", "content": query}
                ],
                "chat_id": f"test_{hash(query)}",
            }
            record = {"query": query, "expected": expected, "tier": None}
            try:
                # Run through inlet
                result_body = await f.inlet(body)
                # Extract what was injected
                system_msg = next(
                    (m for m in result_body.get("messages", []) if m.get("role") == "system"),
                    None
                )
                if system_msg:
                    content = system_msg.get("content", "")
                    record["tier"] = next(
                        (tier for tag, tier in TIER_TAGS if tag in content), 0
                    )
                    print(f"Tier Used: {record['tier']}")
                    # Preview the first 200 chars from the first "---" delimiter.
                    # find() returns -1 when absent; index 0 is a valid match.
                    context_start = content.find("---")
                    if context_start != -1:
                        print(f"Context Preview: {content[context_start:context_start + 200]}...")
                else:
                    print("No system message injected")
            except Exception as e:
                print(f"ERROR: {e}")
                record["error"] = str(e)
            results.append(record)
        print("\n" + "=" * 70)
        print("SUMMARY")
        print("=" * 70)
        for r in results:
            # Keep the three outcomes apart: a raised error, no injection at
            # all, and a real tier number (0 = injected but untagged prompt).
            if "error" in r:
                tier_str = "ERROR"
            elif r["tier"] is None:
                tier_str = "no injection"
            else:
                tier_str = f"Tier {r['tier']}"
            print(f"  {r['query'][:40]:<40} → {tier_str}")
        return results
    asyncio.run(run_tests())