implement three-tier cascade: Qdrant → Kiwix → SearXNG

- Add Kiwix integration with HTML parser for offline Wikipedia search
- Add SearXNG integration for web search fallback
- Cascade triggered when FlashRank top-1 score < 0.5 threshold
- Context tagging: [DOMAIN_KNOWLEDGE], [OFFLINE_WIKI], [WEB_SEARCH]
- Cascade decision logging to /opt/recon/logs/cascade.jsonl
- Graceful degradation: skip unavailable tiers
- Version bumped to 5.0.0

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-05-07 22:33:14 +00:00
commit 81611110eb

View file

@ -1,8 +1,8 @@
"""
title: RECON Knowledge Base
author: Echo6
version: 4.3.0
description: RAG filter that searches the RECON knowledge base and injects reference material into Aurora's context. Emits citations with PDF download links. Supports intent-based metadata filtering, FlashRank neural reranking with MMR diversity, Ollama-powered query expansion, transcript source boosting, semantic query routing with inline navigation, and address book place resolution.
version: 5.0.0
description: RAG filter with three-tier cascade: Qdrant (domain knowledge) → Kiwix (offline wiki) → SearXNG (web search). Supports intent-based metadata filtering, FlashRank neural reranking with MMR diversity, Ollama-powered query expansion, transcript source boosting, semantic query routing with inline navigation, and address book place resolution.
"""
import logging
@ -10,8 +10,13 @@ import json
import math
import re
import threading
import html
from datetime import datetime
from html.parser import HTMLParser
from pathlib import Path
from typing import Optional, Callable, Awaitable
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import quote, unquote
import requests
from pydantic import BaseModel, Field
@ -22,6 +27,26 @@ log = logging.getLogger(__name__)
# even if OWI instantiates separate Filter objects per call.
_SOURCE_STORE: dict[str, list] = {}
# ── CASCADE CONFIGURATION (v5.0.0) ───────────────────────────────────────────
# FlashRank score threshold for Tier 1 (Qdrant). Below this, fall through to Tier 2.
# Based on calibration: RECON queries cluster at 0.95-1.0, misses below 0.3.
# 0.5 is conservative - will let more through to Kiwix than strictly necessary.
# NOTE(review): the cascade decision visible in this file compares against the
# `cascade_threshold` valve (default 0.5) rather than this constant — confirm
# whether this constant is still referenced anywhere.
CASCADE_CONFIDENCE_THRESHOLD = 0.5
# Kiwix-serve configuration
# Base URL that the catalog, search and article requests are built on.
KIWIX_BASE_URL = "http://localhost:8430"
# Applied to both the catalog listing request and per-book search requests.
KIWIX_SEARCH_TIMEOUT = 5 # seconds
# Applied when fetching a single article page.
KIWIX_ARTICLE_TIMEOUT = 5 # seconds
# Number of Kiwix hits formatted into the prompt; the search helper returns up
# to twice this many so the caller can trim.
KIWIX_MAX_RESULTS = 3
# SearXNG configuration
SEARXNG_URL = "http://192.168.1.102:8080"
SEARXNG_TIMEOUT = 5 # seconds
# Upper bound on the web results kept from a single SearXNG response.
SEARXNG_MAX_RESULTS = 5
# Cascade logging
# JSONL file (one JSON object per line); the parent directory is created on
# demand by the logging helper.
CASCADE_LOG_PATH = Path("/opt/recon/logs/cascade.jsonl")
# ── Semantic Query Router (v4.3.0) ───────────────────────────────────────────
ROUTE_EXAMPLES = {
"nav_route": [
@ -359,6 +384,247 @@ def _address_book_lookup(query: str, address_book_url: str) -> dict | None:
# ── End router/nav code ──────────────────────────────────────────────────────
# ── Kiwix Search Helpers (v5.0.0) ────────────────────────────────────────────
class _KiwixResultParser(HTMLParser):
"""Parse Kiwix search results HTML to extract articles."""
def __init__(self):
super().__init__()
self.results = []
self._in_results = False
self._in_li = False
self._in_cite = False
self._in_info = False
self._current = {}
self._capture_text = False
def handle_starttag(self, tag, attrs):
attrs_dict = dict(attrs)
if tag == "div" and "results" in attrs_dict.get("class", ""):
self._in_results = True
elif self._in_results and tag == "li":
self._in_li = True
self._current = {"title": "", "url": "", "snippet": "", "word_count": ""}
elif self._in_li and tag == "a" and not self._current.get("url"):
self._current["url"] = attrs_dict.get("href", "")
self._capture_text = True
elif self._in_li and tag == "cite":
self._in_cite = True
self._capture_text = True
elif self._in_li and tag == "div" and "informations" in attrs_dict.get("class", ""):
self._in_info = True
self._capture_text = True
def handle_endtag(self, tag):
if tag == "div" and self._in_results and not self._in_li:
self._in_results = False
elif tag == "li" and self._in_li:
if self._current.get("url"):
self.results.append(self._current)
self._current = {}
self._in_li = False
elif tag == "a" and self._capture_text and not self._in_cite:
self._capture_text = False
elif tag == "cite":
self._in_cite = False
self._capture_text = False
elif tag == "div" and self._in_info:
self._in_info = False
self._capture_text = False
def handle_data(self, data):
if self._capture_text and self._in_li:
text = data.strip()
if self._in_cite:
self._current["snippet"] += text + " "
elif self._in_info:
self._current["word_count"] = text
elif not self._current.get("title"):
self._current["title"] = text
def _strip_html_tags(html_content: str) -> str:
"""Simple HTML to plain text conversion using stdlib."""
# Remove script and style elements
text = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
# Remove tags
text = re.sub(r'<[^>]+>', ' ', text)
# Decode entities
text = html.unescape(text)
# Normalize whitespace
text = re.sub(r'\s+', ' ', text).strip()
return text
def _fetch_kiwix_books() -> list[str]:
    """Fetch the list of available books from the kiwix-serve catalog.

    Returns:
        Book names taken from ``href="/content/<name>"`` attributes in the
        catalog response, de-duplicated in first-seen order. Returns an
        empty list if the request fails for any reason.
    """
    try:
        resp = requests.get(
            f"{KIWIX_BASE_URL}/catalog/v2/entries",
            timeout=KIWIX_SEARCH_TIMEOUT,
        )
        resp.raise_for_status()
        # Extract book names from href attributes
        books = re.findall(r'href="/content/([^"]+)"', resp.text)
        # De-duplicate while keeping catalog order: a set would give an
        # arbitrary order that changes between processes, making it
        # non-deterministic which books callers pick when they slice the list.
        return list(dict.fromkeys(books))
    except Exception as e:
        log.warning(f"Failed to fetch Kiwix book list: {e}")
        return []
def _search_kiwix_book(book: str, query: str, limit: int = 5) -> list[dict]:
    """Search a single Kiwix book and return at most ``limit`` results.

    Args:
        book: Book name as listed in the kiwix-serve catalog.
        query: Free-text search pattern.
        limit: Maximum number of hits to return.

    Returns:
        Parsed result dicts (``title``, ``url``, ``snippet``, ``word_count``)
        with a ``book`` key added. Returns an empty list on a non-200
        response or any error.
    """
    try:
        resp = requests.get(
            f"{KIWIX_BASE_URL}/search",
            params={"content": book, "pattern": query, "limit": limit},
            timeout=KIWIX_SEARCH_TIMEOUT,
        )
        if resp.status_code != 200:
            return []
        parser = _KiwixResultParser()
        parser.feed(resp.text)
        # Enforce the cap locally as well: if the server does not honor the
        # ``limit`` query parameter, one book could otherwise return a full
        # page of hits and crowd out the other books.
        # NOTE(review): kiwix-serve documents ``pageLength`` for page size —
        # confirm whether ``limit`` has any effect server-side.
        hits = parser.results[:max(limit, 0)]
        # Add book name to results
        for hit in hits:
            hit["book"] = book
        return hits
    except Exception as e:
        log.warning(f"Kiwix search failed for {book}: {e}")
        return []
def _fetch_kiwix_article(url_path: str) -> str:
    """Download a Kiwix article and return its text, capped at 4000 characters.

    The first container found, in the order ``<main>``, ``<article>``, then a
    ``<div>`` whose class contains ``content``, is used as the article body.
    If none is found the whole page is used. Returns "" on any failure.
    """
    # Ordered from most to least specific; the first match wins
    container_patterns = (
        r'<main[^>]*>(.*?)</main>',
        r'<article[^>]*>(.*?)</article>',
        r'<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>',
    )
    try:
        resp = requests.get(
            f"{KIWIX_BASE_URL}{url_path}",
            timeout=KIWIX_ARTICLE_TIMEOUT,
        )
        resp.raise_for_status()
        page = resp.text
        body = page  # fall back to the full page when no container matches
        for pattern in container_patterns:
            found = re.search(pattern, page, re.DOTALL | re.IGNORECASE)
            if found:
                body = found.group(1)
                break
        return _strip_html_tags(body)[:4000]  # Limit to 4000 chars
    except Exception as e:
        log.warning(f"Failed to fetch Kiwix article {url_path}: {e}")
        return ""
def _search_kiwix(query: str, books: list[str]) -> list[dict]:
    """Query Kiwix over a subset of ``books`` and return the merged hits.

    English-looking books are searched first (at most three). Only when they
    yield fewer than ``KIWIX_MAX_RESULTS`` hits are up to two further books
    tried, skipping those tagged ``_af_``, ``_de_``, ``_fr_`` or ``_es_``.
    At most ``KIWIX_MAX_RESULTS * 2`` hits are returned.
    """
    english_markers = ("wikipedia_en", "_en_", "_eng_")
    skipped_languages = ("_af_", "_de_", "_fr_", "_es_")

    english_books = []
    fallback_books = []
    for name in books:
        if any(marker in name for marker in english_markers):
            english_books.append(name)
        elif all(code not in name for code in skipped_languages):
            fallback_books.append(name)

    hits = []
    # English content first, capped at three books
    for name in english_books[:3]:
        hits.extend(_search_kiwix_book(name, query, limit=5))

    # Widen the search only when the English books came up short
    if len(hits) < KIWIX_MAX_RESULTS:
        for name in fallback_books[:2]:
            hits.extend(_search_kiwix_book(name, query, limit=3))

    # Hand back up to twice the final count so the caller can trim
    return hits[:KIWIX_MAX_RESULTS * 2]
# ── SearXNG Search Helpers (v5.0.0) ──────────────────────────────────────────
def _search_searxng(query: str) -> list[dict]:
    """Run a SearXNG JSON search; any failure yields an empty list.

    Returns at most ``SEARXNG_MAX_RESULTS`` dicts with the keys ``title``,
    ``url``, ``snippet`` (from the response's ``content``), ``engines`` and
    ``score``.
    """
    try:
        resp = requests.get(
            f"{SEARXNG_URL}/search",
            params={"q": query, "format": "json"},
            timeout=SEARXNG_TIMEOUT,
        )
        if resp.status_code != 200:
            log.warning(f"SearXNG returned status {resp.status_code}")
            return []

        hits = resp.json().get("results", [])
        # Normalize each hit to the fields used downstream
        return [
            {
                "title": hit.get("title", ""),
                "url": hit.get("url", ""),
                "snippet": hit.get("content", ""),
                "engines": hit.get("engines", []),
                "score": hit.get("score", 0),
            }
            for hit in hits[:SEARXNG_MAX_RESULTS]
        ]
    except requests.Timeout:
        log.warning("SearXNG request timed out (offline or slow)")
        return []
    except requests.ConnectionError:
        log.warning("SearXNG connection failed (offline)")
        return []
    except Exception as e:
        log.warning(f"SearXNG search failed: {e}")
        return []
# ── Cascade Logging (v5.0.0) ─────────────────────────────────────────────────
def _log_cascade_decision(
    query: str,
    router_intent: str,
    top_1_score: float,
    tier_used: int,
    num_results: int,
) -> None:
    """Append one cascade decision to the JSONL log for threshold tuning.

    Best-effort: any failure is logged as a warning and never raised.

    Args:
        query: The user query that was searched.
        router_intent: Route label assigned to the query.
        top_1_score: Score of the best Tier 1 result (0.0 when there is none).
        tier_used: Tier that supplied the final context (1, 2 or 3).
        num_results: Number of results returned by that tier.
    """
    # Local import: the module only imports ``datetime`` from ``datetime``
    from datetime import timezone

    try:
        CASCADE_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        # Same "naive UTC ISO string + Z" format as before, without the
        # deprecated datetime.utcnow().
        timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
        entry = {
            "timestamp": timestamp,
            "query": query,
            "router_intent": router_intent,
            # Coerce to a native float: rounding e.g. a NumPy scalar returns
            # a value json.dumps cannot serialize, which would silently drop
            # the whole entry.
            "top_1_score": round(float(top_1_score), 4),
            "tier_used": tier_used,
            "num_results": num_results,
        }
        with open(CASCADE_LOG_PATH, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry) + "\n")
    except Exception as e:
        log.warning(f"Failed to log cascade decision: {e}")
# ── End cascade helpers ──────────────────────────────────────────────────────
# Subdomains excluded from Medical results when tactical context detected
_OBSTETRIC_SUBDOMAINS = [
"Obstetrics", "Midwifery", "Pregnancy", "Pregnancy Care",
@ -493,11 +759,27 @@ class Filter:
default="http://100.64.0.24:8420",
description="RECON address book API base URL",
)
cascade_enabled: bool = Field(
default=True,
description="Enable three-tier cascade (Qdrant → Kiwix → SearXNG)",
)
cascade_threshold: float = Field(
default=0.5,
description="FlashRank score threshold for cascade fallthrough",
)
def __init__(self):
    """Initialize per-instance state: the valves plus empty/unset caches."""
    # Kiwix book list is unset here and fetched on first use
    self._kiwix_books: list[str] | None = None
    self._ranker = None
    self._expansion_cache: dict[str, list[str]] = {}
    self.valves = self.Valves()
def _get_kiwix_books(self) -> list[str]:
    """Get the cached list of Kiwix books, fetching it on first use.

    An empty fetch result is not cached, because the fetch helper returns
    an empty list when the catalog request fails. Caching it would disable
    the Kiwix tier for the lifetime of this instance even after
    kiwix-serve comes back, so the fetch is retried on the next call.

    Returns:
        The list of book names, or an empty list if none could be loaded.
    """
    if self._kiwix_books is None:
        fetched = _fetch_kiwix_books()
        if not fetched:
            log.info("Kiwix book list empty or unavailable; will retry on next use")
            return []
        self._kiwix_books = fetched
        log.info(f"Loaded {len(self._kiwix_books)} Kiwix books")
    return self._kiwix_books
def _embed_query(self, text: str) -> list:
"""Embed a query string using TEI."""
@ -544,7 +826,7 @@ class Filter:
for item in ranked[:self.valves.rerank_top_n]:
idx = item["id"]
result_copy = dict(results[idx])
result_copy["score"] = item["score"]
result_copy["score"] = float(item["score"])
reranked.append(result_copy)
return reranked
@ -756,7 +1038,7 @@ class Filter:
log.warning(f"Expanded search for {term!r} failed: {e}")
return results
def _format_context(self, results: list) -> str:
def _format_context(self, results: list, tier_tag: str = "DOMAIN_KNOWLEDGE") -> str:
"""Format search results into a context block for the system prompt."""
if not results:
return ""
@ -814,7 +1096,69 @@ class Filter:
else:
dl_str = ""
block = f"[{i}] {citation} (relevance: {score:.2f})\n{summary}{facts_str}{domain_str}{dl_str}"
block = f"[{tier_tag}:{i}] {citation} (relevance: {score:.2f})\n{summary}{facts_str}{domain_str}{dl_str}"
blocks.append(block)
return "\n\n".join(blocks)
def _format_kiwix_context(self, results: list[dict]) -> str:
"""Format Kiwix search results into a context block."""
if not results:
return ""
blocks = []
for i, r in enumerate(results, 1):
title = r.get("title", "Unknown")
snippet = r.get("snippet", "").strip()
book = r.get("book", "")
url_path = r.get("url", "")
# Build wiki URL
if url_path:
# Extract article path from /content/book/path
path_match = re.search(r'/content/[^/]+/(.+)$', url_path)
if path_match:
article_path = path_match.group(1)
wiki_url = f"https://wiki.echo6.co/viewer#{book}/{article_path}"
else:
wiki_url = f"https://wiki.echo6.co/viewer#{book}"
else:
wiki_url = ""
# Fetch article content if available
content = ""
if url_path:
content = _fetch_kiwix_article(url_path)
if content:
content = content[:1500] # Limit per article
if not content:
content = snippet
block = f"[OFFLINE_WIKI:{i}] {title}\n{content}"
if wiki_url:
block += f"\nSource: {wiki_url}"
blocks.append(block)
return "\n\n".join(blocks)
def _format_searxng_context(self, results: list[dict]) -> str:
"""Format SearXNG search results into a context block."""
if not results:
return ""
blocks = []
for i, r in enumerate(results, 1):
title = r.get("title", "Unknown")
snippet = r.get("snippet", "")
url = r.get("url", "")
engines = r.get("engines", [])
engine_str = f" (via {', '.join(engines[:2])})" if engines else ""
block = f"[WEB_SEARCH:{i}] {title}{engine_str}\n{snippet}"
if url:
block += f"\nSource: {url}"
blocks.append(block)
return "\n\n".join(blocks)
@ -838,11 +1182,14 @@ class Filter:
if not query or len(query.strip()) < 3:
return body
router_intent = "rag_search"
# ── ROUTER GATE (v4.3.0) ─────────────────────────────────────────
if self.valves.router_enabled:
route, confidence = _classify_query(
query, self.valves.tei_url, self.valves.router_threshold
)
router_intent = route
log.info(f"Router: {query!r}{route} ({confidence:.3f})")
if route == "direct_answer":
@ -903,6 +1250,11 @@ class Filter:
}
)
tier_used = 1
top_1_score = 0.0
final_context = ""
final_results = []
try:
vector = self._embed_query(query)
@ -975,33 +1327,112 @@ class Filter:
results = _rerank_by_keyword_overlap(query, results)
results = results[:self.valves.top_k]
# Store results for outlet citations (module-level, keyed by chat_id)
chat_id = body.get("chat_id", body.get("metadata", {}).get("chat_id", ""))
if chat_id:
_SOURCE_STORE[chat_id] = results
# Get top-1 score for cascade decision
top_1_score = results[0]["score"] if results else 0.0
# Build context block
context = self._format_context(results)
# ── CASCADE DECISION POINT (v5.0.0) ──────────────────────────────
if self.valves.cascade_enabled and top_1_score < self.valves.cascade_threshold:
# Tier 1 score too low, try Tier 2 (Kiwix)
log.info(f"Cascade: Tier 1 score {top_1_score:.3f} < {self.valves.cascade_threshold}, trying Kiwix")
if context:
rag_prompt = (
"You have access to the RECON knowledge base — a curated library of military field manuals, "
"survival guides, preparedness literature, and video transcripts. Answer the user's question using "
"the reference material below. Reference sources using [1], [2], [3] etc. matching the "
"numbered sources provided. Use these numbers inline in your response.\n\n"
"If the reference material doesn't adequately answer the question, say so explicitly rather "
"than filling gaps with general knowledge.\n\n"
"---REFERENCE MATERIAL---\n\n"
f"{context}\n\n"
"---END REFERENCE MATERIAL---"
)
if __event_emitter__:
await __event_emitter__(
{"type": "status", "data": {"description": "Searching offline encyclopedia...", "done": False}}
)
kiwix_results = _search_kiwix(query, self._get_kiwix_books())
if kiwix_results:
tier_used = 2
final_context = self._format_kiwix_context(kiwix_results[:KIWIX_MAX_RESULTS])
log.info(f"Cascade: Tier 2 (Kiwix) returned {len(kiwix_results)} results")
else:
# Tier 2 failed, try Tier 3 (SearXNG)
log.info("Cascade: Tier 2 empty, trying SearXNG")
if __event_emitter__:
await __event_emitter__(
{"type": "status", "data": {"description": "Searching the web...", "done": False}}
)
searxng_results = _search_searxng(query)
if searxng_results:
tier_used = 3
final_context = self._format_searxng_context(searxng_results)
log.info(f"Cascade: Tier 3 (SearXNG) returned {len(searxng_results)} results")
else:
# All tiers exhausted, fall back to whatever Tier 1 had
log.info("Cascade: All tiers exhausted, using Tier 1 results")
tier_used = 1
final_context = self._format_context(results, "DOMAIN_KNOWLEDGE")
final_results = results
else:
# Tier 1 score good enough, use Qdrant results
tier_used = 1
final_context = self._format_context(results, "DOMAIN_KNOWLEDGE")
final_results = results
# Store results for outlet citations (only for Tier 1)
if tier_used == 1:
chat_id = body.get("chat_id", body.get("metadata", {}).get("chat_id", ""))
if chat_id:
_SOURCE_STORE[chat_id] = final_results
# Log cascade decision
_log_cascade_decision(
query=query,
router_intent=router_intent,
top_1_score=top_1_score,
tier_used=tier_used,
num_results=len(results) if tier_used == 1 else (len(kiwix_results) if tier_used == 2 else len(searxng_results) if tier_used == 3 else 0),
)
# Build the RAG prompt with tier-appropriate instructions
if final_context:
if tier_used == 1:
rag_prompt = (
"You have access to the RECON knowledge base — a curated library of military field manuals, "
"survival guides, preparedness literature, and video transcripts. Answer the user's question using "
"the reference material below. Reference sources using [DOMAIN_KNOWLEDGE:1], [DOMAIN_KNOWLEDGE:2], etc.\n\n"
"If the reference material doesn't adequately answer the question, say so explicitly rather "
"than filling gaps with general knowledge.\n\n"
"---REFERENCE MATERIAL---\n\n"
f"{final_context}\n\n"
"---END REFERENCE MATERIAL---"
)
elif tier_used == 2:
rag_prompt = (
"The RECON domain knowledge base did not have high-confidence results for this query. "
"The following information comes from offline Wikipedia/encyclopedia sources (Kiwix). "
"Reference sources using [OFFLINE_WIKI:1], [OFFLINE_WIKI:2], etc.\n\n"
"Note: This is general encyclopedia content, not domain-specific preparedness material.\n\n"
"---OFFLINE WIKI CONTENT---\n\n"
f"{final_context}\n\n"
"---END OFFLINE WIKI CONTENT---"
)
else: # tier_used == 3
rag_prompt = (
"Neither the RECON knowledge base nor offline encyclopedias had relevant content. "
"The following information comes from a live web search. Reference sources using [WEB_SEARCH:1], etc.\n\n"
"Note: Web search results may be less reliable than curated sources. Verify important information.\n\n"
"---WEB SEARCH RESULTS---\n\n"
f"{final_context}\n\n"
"---END WEB SEARCH RESULTS---"
)
else:
rag_prompt = (
"You have access to the RECON knowledge base, but no relevant reference material was "
"found for this query. Answer from your general knowledge and clearly flag that your "
"response is NOT backed by the RECON reference library."
"found for this query in any tier (domain knowledge, offline wiki, or web search). "
"Answer from your general knowledge and clearly flag that your response is NOT backed by references."
)
# Add source priority instruction
rag_prompt += (
"\n\nSource priority: When sources overlap, prefer DOMAIN_KNOWLEDGE over OFFLINE_WIKI over WEB_SEARCH. "
"Always cite which tier your information came from."
)
# Inject into system message
system_msg = next(
(m for m in messages if m.get("role") == "system"), None
@ -1013,8 +1444,10 @@ class Filter:
0, {"role": "system", "content": rag_prompt}
)
# Emit final status
if __event_emitter__:
status_msg = f"Found {len(results)} reference{'s' if len(results) != 1 else ''}" if results else "No matching references found"
tier_names = {1: "RECON", 2: "Kiwix", 3: "Web"}
status_msg = f"Found results from {tier_names.get(tier_used, 'unknown')} (Tier {tier_used})"
await __event_emitter__(
{
"type": "status",
@ -1117,3 +1550,104 @@ class Filter:
log.warning(f"Failed to emit citation (id={pid}): {e}")
return body
# ── TEST BLOCK ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    import asyncio

    # Test queries for each tier
    TEST_QUERIES = [
        ("tourniquet application steps", "Should hit Tier 1 (RECON)"),
        ("population of Ukraine", "Should hit Tier 2 (Kiwix)"),
        ("history of the Winter War between Finland and Russia", "Should hit Tier 2 (Kiwix)"),
        ("latest iPhone reviews 2026", "Should hit Tier 3 (SearXNG)"),
        ("compass declination adjustment", "Should hit Tier 1 (RECON)"),
        ("what is the Coriolis effect", "Could go either way"),
    ]

    async def run_tests():
        """Run each test query through ``Filter.inlet`` and report the tier used.

        The tier is inferred from the tag found in the injected system
        message: 1/2/3 for the matching tier tag, 0 when a system message
        exists but carries no tier tag, None when nothing was injected or
        the call raised.
        """
        f = Filter()
        results = []

        print("=" * 70)
        print("CASCADE TEST RESULTS")
        print("=" * 70)

        for query, expected in TEST_QUERIES:
            # Visible per-query separators (the original multiplied an empty
            # string, which printed blank lines)
            print(f"\n{'─' * 70}")
            print(f"Query: {query}")
            print(f"Expected: {expected}")
            print("─" * 70)

            # Simulate a request body
            body = {
                "messages": [
                    {"role": "user", "content": query}
                ],
                "chat_id": f"test_{hash(query)}",
            }

            try:
                # Run through inlet
                result_body = await f.inlet(body)

                # Extract what was injected
                system_msg = next(
                    (m for m in result_body.get("messages", []) if m.get("role") == "system"),
                    None
                )

                if system_msg:
                    content = system_msg.get("content", "")

                    # Determine tier used
                    if "[DOMAIN_KNOWLEDGE:" in content:
                        tier = 1
                    elif "[OFFLINE_WIKI:" in content:
                        tier = 2
                    elif "[WEB_SEARCH:" in content:
                        tier = 3
                    else:
                        tier = 0

                    print(f"Tier Used: {tier}")

                    # Preview the first 200 chars from the first "---" marker;
                    # find() returns -1 when absent and 0 is a valid position
                    context_start = content.find("---")
                    if context_start != -1:
                        context_preview = content[context_start:context_start + 200]
                        print(f"Context Preview: {context_preview}...")

                    results.append({
                        "query": query,
                        "expected": expected,
                        "tier": tier,
                    })
                else:
                    print("No system message injected")
                    results.append({
                        "query": query,
                        "expected": expected,
                        "tier": None,
                    })

            except Exception as e:
                print(f"ERROR: {e}")
                results.append({
                    "query": query,
                    "expected": expected,
                    "tier": None,
                    "error": str(e),
                })

        print("\n" + "=" * 70)
        print("SUMMARY")
        print("=" * 70)
        for r in results:
            # Tier 0 is a completed run without a tier tag, not an error, so
            # test against None rather than truthiness
            tier_str = f"Tier {r['tier']}" if r.get("tier") is not None else "ERROR"
            print(f" {r['query'][:40]:<40} → {tier_str}")

        return results

    asyncio.run(run_tests())