diff --git a/config/profiles/home.yaml b/config/profiles/home.yaml index de704d9..5269812 100644 --- a/config/profiles/home.yaml +++ b/config/profiles/home.yaml @@ -6,13 +6,13 @@ profile: home region_name: "North America" tileset: - url: "/tiles/planet/current.pmtiles" + url: "/tiles/na.pmtiles" bounds: [-168, 14, -52, 72] max_zoom: 15 attribution: "Protomaps © OSM" tileset_hillshade: - url: "/tiles/planet-dem.pmtiles" + url: "/tiles/hillshade-na.pmtiles" encoding: "terrarium" max_zoom: 12 @@ -31,20 +31,16 @@ services: address_book: "/api/address_book" valhalla: "/valhalla" -auth: - login_url: "/outpost.goauthentik.io/start?rd=%2F" - logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/" - features: has_nominatim_details: true - has_kiwix_wiki: true + has_kiwix_wiki: false has_hillshade: true has_3d_terrain: false has_traffic_overlay: true has_landclass: true has_public_lands_layer: true has_contours: true - has_contours_test: false + has_contours_test: true has_contours_test_10ft: false has_address_book_write: false has_overture_enrichment: true @@ -52,16 +48,7 @@ features: has_contacts: true has_wiki_rewriting: true has_wiki_discovery: false - has_usfs_trails: true - has_blm_trails: true defaults: center: [42.5736, -114.6066] zoom: 10 - -# Offroute wilderness routing -offroute: - osm_pbf_path: "/mnt/nav/sources/idaho-latest.osm.pbf" - densify_interval_m: 100 - postgis_dsn: "dbname=padus" - diff --git a/config/profiles/minimal_pi.yaml b/config/profiles/minimal_pi.yaml index c2fd90a..e3ae0fd 100644 --- a/config/profiles/minimal_pi.yaml +++ b/config/profiles/minimal_pi.yaml @@ -26,11 +26,6 @@ services: address_book: "/api/address_book" valhalla: "/valhalla" -# TODO(matt): confirm logout next= host for this profile -auth: - login_url: "/outpost.goauthentik.io/start?rd=%2F" - logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/" - features: has_nominatim_details: false 
has_kiwix_wiki: false diff --git a/config/profiles/regional_pi.yaml b/config/profiles/regional_pi.yaml index b6f2cad..8e70cd6 100644 --- a/config/profiles/regional_pi.yaml +++ b/config/profiles/regional_pi.yaml @@ -31,11 +31,6 @@ services: address_book: "/api/address_book" valhalla: "/valhalla" -# TODO(matt): confirm logout next= host for this profile -auth: - login_url: "/outpost.goauthentik.io/start?rd=%2F" - logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/" - features: has_nominatim_details: true has_kiwix_wiki: false diff --git a/lib/address_book.py b/lib/address_book.py new file mode 100644 index 0000000..f9827f6 --- /dev/null +++ b/lib/address_book.py @@ -0,0 +1,160 @@ +""" +RECON Address Book — YAML-backed saved-location lookup. + +Provides named locations (home, work, etc.) that short-circuit Photon +geocoding when an exact alias match is found. + +Config: /opt/recon/config/address_book.yaml +""" + +import os +import re +import threading + +import yaml + +from .utils import setup_logging + +logger = setup_logging('recon.address_book') + +_CONFIG_PATH = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + 'config', 'address_book.yaml', +) + +_lock = threading.Lock() +_entries: list[dict] = [] +_mtime: float = 0.0 + + +def _reload_if_changed(): + """Reload the YAML file if its mtime has changed.""" + global _entries, _mtime + try: + st = os.stat(_CONFIG_PATH) + except FileNotFoundError: + logger.warning("Address book not found: %s", _CONFIG_PATH) + _entries = [] + _mtime = 0.0 + return + + if st.st_mtime == _mtime: + return + + with _lock: + # Double-check after acquiring lock + try: + st = os.stat(_CONFIG_PATH) + except FileNotFoundError: + _entries = [] + _mtime = 0.0 + return + if st.st_mtime == _mtime: + return + + with open(_CONFIG_PATH, 'r') as f: + data = yaml.safe_load(f) or {} + + raw = data.get('entries', []) + loaded = [] + for entry in raw: + # Normalise aliases to 
lowercase for matching + aliases = [a.lower() for a in entry.get('aliases', [])] + loaded.append({ + 'id': entry.get('id', ''), + 'name': entry.get('name', ''), + 'aliases': aliases, + 'address': entry.get('address', ''), + 'lat': entry.get('lat'), + 'lon': entry.get('lon'), + 'tags': entry.get('tags', []), + }) + _entries = loaded + _mtime = st.st_mtime + logger.info("Address book loaded: %d entries from %s", len(_entries), _CONFIG_PATH) + + +def load(): + """Ensure the address book is loaded (and refreshed if the file changed).""" + _reload_if_changed() + return _entries + + +def _normalize(text: str) -> str: + """Lowercase, strip, remove commas, collapse whitespace.""" + t = text.strip().lower() + t = t.replace(',', ' ') + return ' '.join(t.split()) + + +def lookup(query: str): + """ + Look up a query against name and aliases. + + Returns dict with the matching entry plus a 'confidence' field: + - "exact": full name/alias match, OR query starts with alias + word boundary + - "partial": query is a strict prefix of the alias (no word boundary needed), or alias appears + as a contiguous token sequence inside the query + - None if no match + + Matching order (first exact wins, else first partial): + 1. normalized(query) == normalized(name or alias) → exact + 2. normalized(query) starts with normalized(alias) + " " → exact + 3. normalized(alias) starts with normalized(query) and is longer than it → partial + 4. 
normalized(alias) is a contiguous token sub-sequence → partial + """ + _reload_if_changed() + q = _normalize(query) + if not q: + return None + + first_exact = None + first_partial = None + + for entry in _entries: + norm_name = _normalize(entry['name']) + check_aliases = [_normalize(a) for a in entry.get('aliases', [])] + all_forms = [norm_name] + check_aliases + + for form in all_forms: + if not form: + continue + + # Rule 1: exact match + if q == form: + return {**entry, 'confidence': 'exact'} + + # Rule 2: query starts with alias + word boundary + if q.startswith(form + ' '): + if first_exact is None: + first_exact = entry + continue + + # Rule 3: alias starts with query (user still typing) + if form.startswith(q) and len(q) < len(form): + if first_partial is None: + first_partial = entry + continue + + # Rule 4: alias is contiguous token sub-sequence in query + # Build regex: token1\s+token2\s+...tokenN + tokens = form.split() + if len(tokens) >= 1: + pattern = r'(?:^|\s)' + r'\s+'.join(re.escape(t) for t in tokens) + r'(?:\s|$)' + if re.search(pattern, q): + if first_partial is None: + first_partial = entry + + if first_exact is not None: + return {**first_exact, 'confidence': 'exact'} + + if first_partial is not None: + return {**first_partial, 'confidence': 'partial'} + + return None + + +def list_all(): + """Return all address book entries.""" + _reload_if_changed() + return list(_entries) diff --git a/lib/address_book_api.py b/lib/address_book_api.py new file mode 100644 index 0000000..020828b --- /dev/null +++ b/lib/address_book_api.py @@ -0,0 +1,31 @@ +""" +RECON Address Book API — Flask Blueprint. + +GET /api/address_book/lookup?q= — best match or 404 +GET /api/address_book/list — all entries +""" + +from flask import Blueprint, request, jsonify + +from . 
import address_book + +address_book_bp = Blueprint('address_book', __name__) + + +@address_book_bp.route('/api/address_book/lookup') +def api_address_book_lookup(): + q = request.args.get('q', '').strip() + if not q: + return jsonify({'error': 'Missing q parameter'}), 400 + + result = address_book.lookup(q) + if result is None: + return '', 404 + + return jsonify(result) + + +@address_book_bp.route('/api/address_book/list') +def api_address_book_list(): + entries = address_book.list_all() + return jsonify(entries) diff --git a/lib/address_book_test.py b/lib/address_book_test.py new file mode 100644 index 0000000..75905f0 --- /dev/null +++ b/lib/address_book_test.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +"""Tests for RECON address book module.""" +import sys +import os + +# Add project root to path +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from lib import address_book + +TESTS = [ + # ── Existing tests ── + ("lookup('home') → exact", + lambda: address_book.lookup("home"), + lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), + + ("lookup('Home') → exact (case-insensitive)", + lambda: address_book.lookup("Home"), + lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), + + ("lookup('214 north st') → exact via alias", + lambda: address_book.lookup("214 north st"), + lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), + + ("lookup('214 North Street') → exact via alias", + lambda: address_book.lookup("214 North Street"), + lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), + + ("lookup('nonexistent place') → None", + lambda: address_book.lookup("nonexistent place"), + lambda r: r is None), + + ("list_all() → 1 entry", + lambda: address_book.list_all(), + lambda r: isinstance(r, list) and len(r) == 1 and r[0]['id'] == 'home'), + + # ── New prefix+boundary tests ── + ("lookup('214 north st filer') → exact (query starts with 
alias)", + lambda: address_book.lookup("214 north st filer"), + lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), + + ("lookup('214 North St Filer ID') → exact (case + trailing state)", + lambda: address_book.lookup("214 North St Filer ID"), + lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), + + ("lookup('214 north st, filer, id') → exact (commas stripped)", + lambda: address_book.lookup("214 north st, filer, id"), + lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), + + ("lookup('home today') → exact (short alias + trailing text)", + lambda: address_book.lookup("home today"), + lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), + + ("lookup('214') → partial (query is prefix of alias)", + lambda: address_book.lookup("214"), + lambda r: r is not None and r['confidence'] == 'partial'), + + ("lookup('214 n') → partial (partial prefix of alias)", + lambda: address_book.lookup("214 n"), + lambda r: r is not None and r['confidence'] == 'partial'), + + ("lookup('completely unrelated query') → None", + lambda: address_book.lookup("completely unrelated query"), + lambda r: r is None), + + ("lookup('214 north streets of filer') → None (no word boundary after st)", + lambda: address_book.lookup("214 north streets of filer"), + lambda r: r is None), +] + +passed = 0 +failed = 0 +for name, fn, check in TESTS: + try: + result = fn() + ok = check(result) + except Exception as e: + ok = False + result = f"EXCEPTION: {e}" + + status = "PASS" if ok else "FAIL" + if ok: + passed += 1 + else: + failed += 1 + print(f" [{status}] {name}") + if not ok: + print(f" got: {result}") + +print(f"\n{passed} passed, {failed} failed") +sys.exit(0 if failed == 0 else 1) diff --git a/lib/api.py b/lib/api.py index a0697bf..8a1f383 100644 --- a/lib/api.py +++ b/lib/api.py @@ -17,13 +17,16 @@ import shutil import tempfile import requests as http_requests -from flask import Flask, request, 
jsonify, redirect, render_template +from flask import Flask, request, jsonify, redirect, render_template, make_response from qdrant_client import QdrantClient from qdrant_client.models import Filter, FieldCondition, MatchValue from werkzeug.utils import secure_filename from .utils import get_config, content_hash, clean_filename_to_title, derive_source_and_category, generate_download_url, setup_logging from .status import StatusDB +from .deployment_config import get_deployment_config +from .place_detail import get_place_detail, get_place_by_wikidata +from .landclass import lookup_landclass, format_summary logger = setup_logging('recon.api') @@ -57,9 +60,19 @@ class _LargeZimRequest(_FlaskRequest): return super()._get_file_stream(total_content_length, content_type, filename, content_length) app.request_class = _LargeZimRequest -# ── Netsyms Blueprint ── -from .netsyms_api import netsyms_bp +# ── Address Book Blueprint ── +from .address_book_api import address_book_bp +app.register_blueprint(address_book_bp) + +# ── Contacts Blueprint ── +from .contacts_api import contacts_bp +app.register_blueprint(contacts_bp) + +# ── Netsyms + Geocode Blueprints ── +from .netsyms_api import netsyms_bp, geocode_bp app.register_blueprint(netsyms_bp) +app.register_blueprint(geocode_bp) + # ── Navigation Constants ── @@ -89,6 +102,12 @@ SETTINGS_SUBNAV = [ {'href': '/settings/health', 'label': 'Service Health'}, ] +NAVI_SUBNAV = [ + {'href': '/nav-i', 'label': 'Overview'}, + {'href': '/deleted-contacts', 'label': 'Deleted Contacts'}, + {'href': '/nav-i/api-keys', 'label': 'API Keys'}, +] + def _format_source_citation(payload): """Format a human-readable citation from a search result payload.""" @@ -315,6 +334,36 @@ def failures_page(): failures=failures) +@app.route("/deleted-contacts") +def deleted_contacts_page(): + from .auth import get_user_id + from .contacts import ContactsDB + user_id = get_user_id() or "anonymous" + db = ContactsDB() + contacts = db.list_deleted(user_id) + 
return render_template("navi/deleted_contacts.html", + domain="navi", subnav=NAVI_SUBNAV, active_page="/deleted-contacts", + contacts=contacts) + + +@app.route("/nav-i") +def navi_landing_page(): + from .auth import get_user_id + from .contacts import ContactsDB + user_id = get_user_id() or "anonymous" + db = ContactsDB() + deleted_count = len(db.list_deleted(user_id)) + return render_template("navi/landing.html", + domain="navi", subnav=NAVI_SUBNAV, active_page="/nav-i", + deleted_count=deleted_count) + + +@app.route("/nav-i/api-keys") +def navi_api_keys_page(): + return render_template("navi/api_keys.html", + domain="navi", subnav=NAVI_SUBNAV, active_page="/nav-i/api-keys") + + @app.route('/peertube') def peertube_dashboard(): return render_template('peertube/dashboard.html', @@ -1159,6 +1208,82 @@ def api_knowledge_stats(): return jsonify(_cache['knowledge_stats']) + +@app.route('/api/traffic/flow/<int:z>/<int:x>/<int:y>.png') +def api_traffic_flow(z, x, y): + """Proxy TomTom traffic flow tiles to hide API key from frontend.""" + key = os.environ.get('TOMTOM_API_KEY') + if not key: + return 'Traffic service not configured', 503 + url = f'https://api.tomtom.com/traffic/map/4/tile/flow/relative/{z}/{x}/{y}.png?key={key}' + try: + resp = http_requests.get(url, timeout=10) + if resp.status_code != 200: + return 'Upstream error', 502 + r = make_response(resp.content) + r.headers['Content-Type'] = 'image/png' + r.headers['Cache-Control'] = 'public, max-age=120' + return r + except Exception: + return 'Upstream timeout', 504 + + +@app.route('/api/place/<osm_type>/<osm_id>') +def api_place_detail(osm_type, osm_id): + """Proxy place details from local Nominatim or Overpass API.""" + result, status = get_place_detail(osm_type, osm_id) + return jsonify(result), status + + +@app.route("/api/place/wikidata/<wikidata_id>") +def api_place_wikidata(wikidata_id): + """Fetch place details from Wikidata entity.""" + result, status = get_place_by_wikidata(wikidata_id) + return jsonify(result), status + + + 
+@app.route('/api/landclass') +def api_landclass(): + """PAD-US land classification lookup for a point.""" + config = get_deployment_config() + if not config.get('features', {}).get('has_landclass'): + return jsonify({'error': 'Land classification not available'}), 404 + + try: + lat = float(request.args.get('lat', '')) + lon = float(request.args.get('lon', '')) + except (ValueError, TypeError): + return jsonify({'error': 'lat and lon required as numbers'}), 400 + + if not (-90 <= lat <= 90) or not (-180 <= lon <= 180): + return jsonify({'error': 'lat must be -90..90, lon must be -180..180'}), 400 + + classifications = lookup_landclass(lat, lon) + is_public = len(classifications) > 0 + is_private = len(classifications) == 0 + summary = format_summary(classifications) + + return jsonify({ + 'lat': lat, + 'lon': lon, + 'classifications': classifications, + 'count': len(classifications), + 'is_public': is_public, + 'is_private': is_private, + 'summary': summary, + }) + + +@app.route('/api/config') +def api_config(): + """Return deployment profile config for frontend consumption.""" + config = get_deployment_config() + resp = jsonify(config) + resp.headers['Cache-Control'] = 'public, max-age=300' + return resp + + @app.route('/api/health') def api_health(): """Health check endpoint for monitoring.""" @@ -1320,6 +1445,60 @@ def api_keys_reload(): +# ── Nav-I API Key Admin ── + +@app.route('/api/nav-i/api-keys/list', methods=['GET']) +def navi_api_keys_list(): + from .api_keys_admin import list_keys + return jsonify({'keys': list_keys()}) + + +@app.route('/api/nav-i/api-keys/update', methods=['POST']) +def navi_api_keys_update(): + from .auth import require_auth + from .api_keys_admin import update_key, update_gemini_key + data = request.get_json(force=True) + name = data.get('name', '') + new_value = data.get('new_value', '') + index = data.get('index') # optional, for Gemini key replacement + if not name or not new_value: + return jsonify({'error': 'name and new_value 
required'}), 400 + if name == 'GEMINI_KEY' and index is not None: + result = update_gemini_key(int(index), new_value) + else: + result = update_key(name, new_value) + if result.get('success'): + return jsonify(result) + return jsonify(result), 400 + + +@app.route('/api/nav-i/api-keys/test', methods=['POST']) +def navi_api_keys_test(): + from .api_keys_admin import test_key + data = request.get_json(force=True) + name = data.get('name', '') + index = data.get('index') # optional, for testing specific Gemini key + if not name: + return jsonify({'error': 'name required'}), 400 + result = test_key(name, index=int(index) if index is not None else None) + return jsonify(result) + + +@app.route('/api/nav-i/api-keys/restart-recon', methods=['POST']) +def navi_api_keys_restart(): + import subprocess + try: + result = subprocess.run( + ['sudo', 'systemctl', 'restart', 'recon'], + capture_output=True, text=True, timeout=30 + ) + if result.returncode == 0: + return jsonify({'success': True, 'note': 'RECON service restarted'}) + return jsonify({'success': False, 'error': result.stderr.strip()}), 500 + except subprocess.TimeoutExpired: + return jsonify({'success': False, 'error': 'Restart timed out'}), 500 + except Exception as e: + return jsonify({'success': False, 'error': str(e)}), 500 # ── YouTube Cookie Management ── @@ -2525,3 +2704,21 @@ def api_metrics_history(): return jsonify({'type': metric_type, 'hours': hours, 'points': points}) except Exception as e: return jsonify({'type': metric_type, 'hours': hours, 'points': [], 'error': str(e)}) + + +# ── Auth state endpoint ───────────────────────────────────────────────────── +# Returns current auth state for frontend consumption. +# This endpoint must be behind Caddy forward_auth to receive X-Authentik-* headers. +@app.route('/api/auth/whoami') +def api_auth_whoami(): + """Return auth state for frontend. 
Behind forward_auth, so headers are present when authenticated.""" + username = request.headers.get('X-Authentik-Username') + if username: + return jsonify({ + 'authenticated': True, + 'username': username, + }) + return jsonify({ + 'authenticated': False, + 'username': None, + }) diff --git a/lib/api_keys_admin.py b/lib/api_keys_admin.py new file mode 100644 index 0000000..3c63565 --- /dev/null +++ b/lib/api_keys_admin.py @@ -0,0 +1,358 @@ +""" +Nav-I API Keys Admin — unified view/update/test for third-party API keys. + +Manages three provider categories: + - Gemini (multiple keys via KeyManager singleton) + - TomTom (single key in .env) + - Google Places (single key in .env) + +All key values are masked in responses. Full values never leave the server +except as user-supplied input on update. +""" +import os +import re +import shutil +import tempfile +import time + +import requests as http_requests + +from .utils import setup_logging + +logger = setup_logging('recon.api_keys_admin') + +ENV_PATH = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '.env') + +# Key definitions: env_name → display metadata +_KEY_DEFS = { + 'TOMTOM_API_KEY': { + 'display_name': 'TomTom', + 'provider': 'tomtom', + }, + 'GOOGLE_PLACES_API_KEY': { + 'display_name': 'Google Places', + 'provider': 'google_places', + }, +} + + +# ── .env read/write helpers ───────────────────────────────────────────── + +def _read_env(): + """Read .env file into a dict of key=value pairs, preserving order.""" + entries = [] # list of (key, value, raw_line) — preserves order and comments + if not os.path.exists(ENV_PATH): + return entries + with open(ENV_PATH, 'r') as f: + for line in f: + raw = line.rstrip('\n') + stripped = raw.strip() + if not stripped or stripped.startswith('#'): + entries.append((None, None, raw)) + continue + m = re.match(r'^([A-Za-z_][A-Za-z0-9_]*)=(.*)$', stripped) + if m: + entries.append((m.group(1), m.group(2).strip().strip('"').strip("'"), raw)) + else: + 
entries.append((None, None, raw)) + return entries + + +def _write_env(entries): + """Atomically write .env from entries list. Backs up to .env.bak first.""" + # Backup current .env + if os.path.exists(ENV_PATH): + bak_path = ENV_PATH + '.bak' + shutil.copy2(ENV_PATH, bak_path) + + # Write to temp file, then rename (atomic on same filesystem) + fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(ENV_PATH), prefix='.env.', suffix='.tmp') + try: + with os.fdopen(fd, 'w') as f: + for key, value, raw in entries: + if key is not None: + f.write(f'{key}={value}\n') + else: + f.write(raw + '\n') + os.rename(tmp_path, ENV_PATH) + except Exception: + # Clean up temp file on failure + try: + os.unlink(tmp_path) + except OSError: + pass + raise + + logger.info(f"Wrote .env atomically ({len([e for e in entries if e[0]])} keys)") + + +def _get_env_value(name): + """Get a single value from .env by key name.""" + for key, value, _ in _read_env(): + if key == name: + return value + return None + + +def _set_env_value(name, new_value): + """Set a single value in .env. Adds if not present.""" + entries = _read_env() + found = False + for i, (key, value, raw) in enumerate(entries): + if key == name: + entries[i] = (name, new_value, f'{name}={new_value}') + found = True + break + if not found: + entries.append((name, new_value, f'{name}={new_value}')) + _write_env(entries) + + +# ── Masking ───────────────────────────────────────────────────────────── + +def _mask_key(value): + """Mask a key: first 4 chars + '...' + last 4 chars. Never return full value.""" + if not value: + return None + if len(value) <= 8: + return '****' + return value[:4] + '...' + value[-4:] + + +# ── List ──────────────────────────────────────────────────────────────── + +def list_keys(): + """ + Return masked status of all managed API keys. + + Returns list of dicts with: name, display_name, provider, masked_value, + is_set, count (for multi-key providers like Gemini). 
+ """ + result = [] + env_mtime = None + if os.path.exists(ENV_PATH): + env_mtime = time.strftime('%Y-%m-%dT%H:%M:%SZ', + time.gmtime(os.path.getmtime(ENV_PATH))) + + # Gemini keys (via KeyManager) + from .key_manager import get_key_manager + km = get_key_manager() + gemini_keys = km.get_masked_keys() + gemini_count = len(gemini_keys) + # Show a single summary entry for Gemini with count + first_masked = gemini_keys[0]['masked'] if gemini_keys else None + result.append({ + 'name': 'GEMINI_KEY', + 'display_name': 'Gemini', + 'provider': 'gemini', + 'masked_value': first_masked, + 'is_set': gemini_count > 0, + 'count': gemini_count, + 'last_modified': env_mtime, + 'keys': gemini_keys, # full list with per-key stats + }) + + # Single-value keys + for env_name, meta in _KEY_DEFS.items(): + value = _get_env_value(env_name) + result.append({ + 'name': env_name, + 'display_name': meta['display_name'], + 'provider': meta['provider'], + 'masked_value': _mask_key(value), + 'is_set': bool(value), + 'count': 1 if value else 0, + 'last_modified': env_mtime, + }) + + return result + + +# ── Update ────────────────────────────────────────────────────────────── + +def update_key(name, new_value): + """ + Update a key value. For Gemini, name should be 'GEMINI_KEY' with an + optional 'index' for replacing a specific key, or use the KeyManager API. + For TomTom/Google Places, writes directly to .env. + + Returns dict with success status and masked value. 
+ """ + new_value = new_value.strip() + if not new_value: + return {'success': False, 'error': 'Key value cannot be empty'} + + if name == 'GEMINI_KEY': + # Use KeyManager for Gemini + from .key_manager import get_key_manager + km = get_key_manager() + try: + idx = km.add_gemini_key(new_value) + return { + 'success': True, + 'name': name, + 'masked_value': _mask_key(new_value), + 'action': 'added', + 'index': idx, + } + except ValueError as e: + return {'success': False, 'error': str(e)} + + if name in _KEY_DEFS: + _set_env_value(name, new_value) + return { + 'success': True, + 'name': name, + 'masked_value': _mask_key(new_value), + 'action': 'updated', + } + + return {'success': False, 'error': f'Unknown key: {name}'} + + +def update_gemini_key(index, new_value): + """Replace a specific Gemini key by index.""" + new_value = new_value.strip() + if not new_value: + return {'success': False, 'error': 'Key value cannot be empty'} + + from .key_manager import get_key_manager + km = get_key_manager() + try: + km.replace_gemini_key(index, new_value) + return { + 'success': True, + 'name': 'GEMINI_KEY', + 'index': index, + 'masked_value': _mask_key(new_value), + 'action': 'replaced', + } + except (ValueError, IndexError) as e: + return {'success': False, 'error': str(e)} + + +# ── Test ──────────────────────────────────────────────────────────────── + +def test_key(name, index=None): + """ + Test a key against its provider API using the current .env value. + + Returns dict with: success, latency_ms, error, note. 
+ """ + if name == 'GEMINI_KEY': + return _test_gemini(index) + elif name == 'TOMTOM_API_KEY': + return _test_tomtom() + elif name == 'GOOGLE_PLACES_API_KEY': + return _test_google_places() + else: + return {'success': False, 'error': f'Unknown key: {name}', 'latency_ms': 0} + + +def _test_gemini(index=None): + """Test Gemini key by listing models.""" + from .key_manager import get_key_manager + km = get_key_manager() + + if index is not None: + key = km.get_gemini_key(index) + if not key: + return {'success': False, 'error': f'Gemini key index {index} not found', 'latency_ms': 0} + else: + key = km.get_gemini_key(0) + if not key: + return {'success': False, 'error': 'No Gemini keys configured', 'latency_ms': 0} + + t0 = time.time() + try: + resp = http_requests.get( + f"https://generativelanguage.googleapis.com/v1beta/models?key={key}", + timeout=10 + ) + latency = int((time.time() - t0) * 1000) + + if resp.status_code == 200 and 'models' in resp.text: + return {'success': True, 'latency_ms': latency, 'error': None, + 'note': 'Models list returned successfully'} + elif resp.status_code == 403: + return {'success': False, 'latency_ms': latency, + 'error': 'Key disabled or quota exhausted'} + elif resp.status_code == 429: + return {'success': True, 'latency_ms': latency, 'error': None, + 'note': 'Valid key — currently rate-limited'} + else: + return {'success': False, 'latency_ms': latency, + 'error': f'HTTP {resp.status_code}'} + except Exception as e: + latency = int((time.time() - t0) * 1000) + return {'success': False, 'latency_ms': latency, 'error': str(e)} + + +def _test_tomtom(): + """Test TomTom key with a minimal geocode request.""" + key = _get_env_value('TOMTOM_API_KEY') + if not key: + return {'success': False, 'error': 'TOMTOM_API_KEY not set', 'latency_ms': 0} + + t0 = time.time() + try: + resp = http_requests.get( + f"https://api.tomtom.com/search/2/geocode/Boise.json", + params={'key': key, 'limit': 1}, + timeout=10 + ) + latency = int((time.time() - 
t0) * 1000) + + if resp.status_code == 200: + data = resp.json() + count = data.get('summary', {}).get('totalResults', 0) + return {'success': True, 'latency_ms': latency, 'error': None, + 'note': f'Geocode returned {count} result(s)'} + elif resp.status_code == 403: + return {'success': False, 'latency_ms': latency, + 'error': 'Invalid or expired key'} + else: + return {'success': False, 'latency_ms': latency, + 'error': f'HTTP {resp.status_code}'} + except Exception as e: + latency = int((time.time() - t0) * 1000) + return {'success': False, 'latency_ms': latency, 'error': str(e)} + + +def _test_google_places(): + """Test Google Places (New) API key with a minimal searchText request.""" + key = _get_env_value('GOOGLE_PLACES_API_KEY') + if not key: + return {'success': False, 'error': 'GOOGLE_PLACES_API_KEY not set', 'latency_ms': 0} + + t0 = time.time() + try: + resp = http_requests.post( + "https://places.googleapis.com/v1/places:searchText", + json={'textQuery': 'Boise Idaho', 'maxResultCount': 1}, + headers={ + 'X-Goog-Api-Key': key, + 'X-Goog-FieldMask': 'places.displayName', + }, + timeout=10 + ) + latency = int((time.time() - t0) * 1000) + + if resp.status_code == 200: + data = resp.json() + count = len(data.get('places', [])) + return {'success': True, 'latency_ms': latency, 'error': None, + 'note': f'searchText returned {count} place(s)'} + elif resp.status_code == 403: + return {'success': False, 'latency_ms': latency, + 'error': 'Key not authorized for Places API (New)'} + elif resp.status_code == 429: + return {'success': True, 'latency_ms': latency, 'error': None, + 'note': 'Valid key — quota exceeded'} + else: + body = resp.text[:200] + return {'success': False, 'latency_ms': latency, + 'error': f'HTTP {resp.status_code}: {body}'} + except Exception as e: + latency = int((time.time() - t0) * 1000) + return {'success': False, 'latency_ms': latency, 'error': str(e)} diff --git a/lib/contacts.py b/lib/contacts.py new file mode 100644 index 
"""
RECON Contacts Database — per-user phone book with soft delete and proximity queries.

Separate DB at data/contacts.db. Thread-local connections with WAL mode (StatusDB pattern).
"""
import math
import os
import sqlite3
import threading
from datetime import datetime, timezone

# sqlite3 connections are only usable from the thread that created them, so
# every thread keeps its own connections here (see ContactsDB._get_conn).
_local = threading.local()

_SCHEMA = """
CREATE TABLE IF NOT EXISTS contacts (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    user_id TEXT NOT NULL,
    label TEXT NOT NULL,
    name TEXT,
    call_sign TEXT,
    phone TEXT,
    email TEXT,
    category TEXT,
    notes TEXT,
    lat REAL,
    lon REAL,
    osm_type TEXT,
    osm_id INTEGER,
    address TEXT,
    show_proximity INTEGER DEFAULT 0,
    created_at TEXT DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now')),
    updated_at TEXT DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now')),
    deleted_at TEXT,
    deleted_by TEXT
);

CREATE INDEX IF NOT EXISTS idx_contacts_user ON contacts(user_id);
CREATE INDEX IF NOT EXISTS idx_contacts_user_category ON contacts(user_id, category);
CREATE INDEX IF NOT EXISTS idx_contacts_user_deleted ON contacts(user_id, deleted_at);
CREATE INDEX IF NOT EXISTS idx_contacts_geo ON contacts(lat, lon);
CREATE UNIQUE INDEX IF NOT EXISTS idx_contacts_home_work
    ON contacts(user_id, label)
    WHERE label IN ('Home', 'Work') AND deleted_at IS NULL;
"""

# The only columns a caller may set through create()/update(). Column names
# are interpolated into the SQL text (values are always bound with '?'), so
# anything not listed here must never reach the statement: the keys come
# straight from request JSON.
_WRITABLE_COLUMNS = frozenset({
    'label', 'name', 'call_sign', 'phone', 'email', 'category', 'notes',
    'lat', 'lon', 'osm_type', 'osm_id', 'address', 'show_proximity',
})


def _utc_now_iso():
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SS.sssZ``.

    This is the same shape SQLite produces for the ``created_at`` /
    ``updated_at`` column defaults, so timestamps written from Python and
    from the schema defaults compare and sort consistently.
    """
    stamp = datetime.now(timezone.utc).isoformat(timespec='milliseconds')
    return stamp.replace('+00:00', 'Z')


def _clean_fields(fields):
    """Return only the caller-writable columns from *fields*.

    Unknown and protected keys (id, user_id, timestamps, deleted_*) are
    dropped; ``show_proximity`` is normalised to the 0/1 the column stores.
    """
    clean = {k: v for k, v in fields.items() if k in _WRITABLE_COLUMNS}
    if 'show_proximity' in clean:
        clean['show_proximity'] = 1 if clean['show_proximity'] else 0
    return clean


def _haversine_m(lat1, lon1, lat2, lon2):
    """Haversine distance in meters."""
    R = 6_371_000
    rlat1, rlat2 = math.radians(lat1), math.radians(lat2)
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat / 2) ** 2 + math.cos(rlat1) * math.cos(rlat2) * math.sin(dlon / 2) ** 2
    # Rounding can push `a` a hair above 1 for near-antipodal points, which
    # would make sqrt(1 - a) raise ValueError.
    a = min(1.0, a)
    return R * 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))


def _row_to_dict(row):
    """Convert sqlite3.Row to dict, casting show_proximity to bool."""
    d = dict(row)
    d['show_proximity'] = bool(d.get('show_proximity', 0))
    return d


class ContactsDB:
    """SQLite-backed contact store; every query is scoped to one ``user_id``.

    Rows are soft-deleted (``deleted_at`` set) and only removed for good by
    ``purge``. A partial unique index allows at most one live 'Home' and one
    live 'Work' contact per user.
    """

    def __init__(self, db_path=None):
        """Open (creating if needed) the database at *db_path*.

        Defaults to ``<repo>/data/contacts.db`` relative to this file.
        """
        if db_path is None:
            db_path = os.path.join(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
                'data', 'contacts.db',
            )
        self.db_path = db_path
        parent = os.path.dirname(db_path)
        if parent:
            # os.makedirs('') raises, so skip it for a bare filename.
            os.makedirs(parent, exist_ok=True)
        self._init_db()

    def _get_conn(self):
        """Return this thread's connection to ``self.db_path``, opening it on first use."""
        conns = getattr(_local, 'contacts_conns', None)
        if conns is None:
            conns = _local.contacts_conns = {}
        # Keyed by path: two ContactsDB instances on different files must
        # not end up sharing whichever connection the thread opened first.
        conn = conns.get(self.db_path)
        if conn is None:
            conn = sqlite3.connect(self.db_path, timeout=30)
            conn.row_factory = sqlite3.Row
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA busy_timeout=5000")
            conns[self.db_path] = conn
        return conn

    def _init_db(self):
        """Create the table and indexes if they do not exist yet."""
        conn = self._get_conn()
        conn.executescript(_SCHEMA)
        conn.commit()

    def list_all(self, user_id, category=None, search=None):
        """Return the user's live contacts ordered by label.

        *category* filters by exact match; *search* is a substring match
        against label, name, call_sign and phone.
        """
        conn = self._get_conn()
        sql = "SELECT * FROM contacts WHERE user_id = ? AND deleted_at IS NULL"
        params = [user_id]
        if category:
            sql += " AND category = ?"
            params.append(category)
        if search:
            sql += " AND (label LIKE ? OR name LIKE ? OR call_sign LIKE ? OR phone LIKE ?)"
            like = f"%{search}%"
            params.extend([like, like, like, like])
        sql += " ORDER BY label"
        return [_row_to_dict(r) for r in conn.execute(sql, params).fetchall()]

    def list_deleted(self, user_id):
        """Return the user's soft-deleted contacts, most recently deleted first."""
        conn = self._get_conn()
        rows = conn.execute(
            "SELECT * FROM contacts WHERE user_id = ? AND deleted_at IS NOT NULL ORDER BY deleted_at DESC",
            (user_id,)
        ).fetchall()
        return [_row_to_dict(r) for r in rows]

    def get(self, user_id, contact_id, include_deleted=False):
        """Return one contact as a dict, or None if it is missing or not owned by *user_id*."""
        conn = self._get_conn()
        sql = "SELECT * FROM contacts WHERE id = ? AND user_id = ?"
        if not include_deleted:
            sql += " AND deleted_at IS NULL"
        row = conn.execute(sql, (contact_id, user_id)).fetchone()
        return _row_to_dict(row) if row else None

    def create(self, user_id, **fields):
        """Insert a contact and return ``(contact, None)``.

        Keys that are not writable columns are ignored. Returns
        ``(None, 'conflict')`` when a constraint rejects the row (for
        example a second live Home/Work contact).
        """
        conn = self._get_conn()
        clean = _clean_fields(fields)
        columns = ['user_id', *clean]
        col_str = ', '.join(columns)
        placeholders = ', '.join('?' for _ in columns)
        values = [user_id, *clean.values()]
        try:
            cur = conn.execute(f"INSERT INTO contacts ({col_str}) VALUES ({placeholders})", values)
            conn.commit()
        except sqlite3.IntegrityError:
            # The implicit transaction stays open after a failed statement;
            # roll it back so this thread's connection is not left holding it.
            conn.rollback()
            return None, 'conflict'
        return self.get(user_id, cur.lastrowid), None

    def update(self, user_id, contact_id, **fields):
        """Update a live contact and return it, or None if there is no such live contact.

        Keys that are not writable columns are ignored; ``updated_at`` is
        always refreshed. ``sqlite3.IntegrityError`` propagates (after a
        rollback) when the change violates a constraint.
        """
        conn = self._get_conn()
        clean = _clean_fields(fields)
        clean['updated_at'] = _utc_now_iso()
        sets = ', '.join(f"{col} = ?" for col in clean)
        values = [*clean.values(), contact_id, user_id]
        try:
            conn.execute(
                f"UPDATE contacts SET {sets} WHERE id = ? AND user_id = ? AND deleted_at IS NULL",
                values,
            )
            conn.commit()
        except sqlite3.IntegrityError:
            conn.rollback()
            raise
        return self.get(user_id, contact_id)

    def soft_delete(self, user_id, contact_id):
        """Mark a live contact deleted; return the row (deleted or not), or None if unknown."""
        conn = self._get_conn()
        now = _utc_now_iso()
        conn.execute(
            "UPDATE contacts SET deleted_at = ?, deleted_by = ? WHERE id = ? AND user_id = ? AND deleted_at IS NULL",
            (now, user_id, contact_id, user_id)
        )
        conn.commit()
        return self.get(user_id, contact_id, include_deleted=True)

    def restore(self, user_id, contact_id):
        """Undelete a contact.

        Returns ``(contact, None)``, ``(None, 'not_found')`` when there is no
        such soft-deleted contact, or ``(None, 'conflict')`` when restoring
        would create a second live Home/Work contact.
        """
        conn = self._get_conn()
        row = self.get(user_id, contact_id, include_deleted=True)
        if not row or not row.get('deleted_at'):
            return None, 'not_found'
        if row.get('label') in ('Home', 'Work'):
            existing = conn.execute(
                "SELECT id FROM contacts WHERE user_id = ? AND label = ? AND deleted_at IS NULL AND id != ?",
                (user_id, row['label'], contact_id)
            ).fetchone()
            if existing:
                return None, 'conflict'
        try:
            conn.execute(
                "UPDATE contacts SET deleted_at = NULL, deleted_by = NULL WHERE id = ? AND user_id = ?",
                (contact_id, user_id)
            )
            conn.commit()
        except sqlite3.IntegrityError:
            # Another writer created the Home/Work row after the check above.
            conn.rollback()
            return None, 'conflict'
        return self.get(user_id, contact_id), None

    def restore_as(self, user_id, contact_id, new_label):
        """Restore a soft-deleted contact with a new label (for Home/Work conflict resolution)."""
        conn = self._get_conn()
        row = self.get(user_id, contact_id, include_deleted=True)
        if not row or not row.get('deleted_at'):
            return None, 'not_found'
        if not new_label or not new_label.strip():
            return None, 'invalid_label'
        now = _utc_now_iso()
        try:
            conn.execute(
                "UPDATE contacts SET deleted_at = NULL, deleted_by = NULL, label = ?, updated_at = ? WHERE id = ? AND user_id = ?",
                (new_label.strip(), now, contact_id, user_id)
            )
            conn.commit()
        except sqlite3.IntegrityError:
            conn.rollback()
            return None, 'conflict'
        return self.get(user_id, contact_id), None

    def purge(self, user_id, contact_id):
        """Permanently delete a contact that is already soft-deleted.

        Returns ``(True, None)``, ``(False, 'not_found')`` or
        ``(False, 'not_deleted')``.
        """
        conn = self._get_conn()
        row = self.get(user_id, contact_id, include_deleted=True)
        if not row:
            return False, 'not_found'
        if not row.get('deleted_at'):
            return False, 'not_deleted'
        conn.execute("DELETE FROM contacts WHERE id = ? AND user_id = ?", (contact_id, user_id))
        conn.commit()
        return True, None

    def find_nearby(self, user_id, lat, lon, radius_m=75):
        """Return live contacts with ``show_proximity`` set within *radius_m* of (lat, lon).

        Each result carries ``distance_m`` (rounded to 0.1 m); results are
        sorted nearest first.
        """
        conn = self._get_conn()
        # Bounding box pre-filter (~111km per degree lat)
        dlat = radius_m / 111_000
        dlon = radius_m / (111_000 * math.cos(math.radians(lat)))
        rows = conn.execute(
            """SELECT * FROM contacts
               WHERE user_id = ? AND deleted_at IS NULL AND show_proximity = 1
               AND lat BETWEEN ? AND ? AND lon BETWEEN ? AND ?""",
            (user_id, lat - dlat, lat + dlat, lon - dlon, lon + dlon)
        ).fetchall()
        results = []
        for r in rows:
            # The box is only a coarse filter; the exact distance decides.
            dist = _haversine_m(lat, lon, r['lat'], r['lon'])
            if dist <= radius_m:
                d = _row_to_dict(r)
                d['distance_m'] = round(dist, 1)
                results.append(d)
        results.sort(key=lambda x: x['distance_m'])
        return results
"""
RECON Contacts API — Flask Blueprint.

Per-user phone book with soft delete, restore, purge, and proximity queries.
All endpoints require Authentik forward-auth (X-Authentik-Username header).
"""
import sqlite3

from flask import Blueprint, request, jsonify

from .auth import require_auth
from .contacts import ContactsDB

contacts_bp = Blueprint('contacts', __name__)

_db = None

# Body keys that would collide with the positional arguments of
# ContactsDB.create()/update() when the payload is splatted as **kwargs
# ("got multiple values for argument 'user_id'").
_RESERVED_KEYS = ('self', 'user_id', 'contact_id')


def _get_db():
    """Return the process-wide ContactsDB, creating it on first use."""
    global _db
    if _db is None:
        _db = ContactsDB()
    return _db


def _json_fields():
    """Return the request body as a dict of contact fields, or None.

    None means the body was not a JSON object; callers answer 400.
    Reserved keys are stripped so the result is safe to pass as **kwargs.
    """
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return None
    return {k: v for k, v in data.items() if k not in _RESERVED_KEYS}


@contacts_bp.route('/api/contacts', methods=['GET'])
@require_auth
def list_contacts():
    """List the caller's contacts, optionally filtered by ?category= and ?search=."""
    db = _get_db()
    category = request.args.get('category')
    search = request.args.get('search')
    return jsonify(db.list_all(request.user_id, category=category, search=search))


@contacts_bp.route('/api/contacts', methods=['POST'])
@require_auth
def create_contact():
    """Create a contact from the JSON body; 409 on a Home/Work conflict."""
    db = _get_db()
    data = _json_fields()
    if data is None:
        return jsonify({'error': 'JSON object required'}), 400
    label = data.get('label')
    if label is None or not str(label).strip():
        # label is NOT NULL in the schema; without this check the failure
        # would be reported as a Home/Work conflict.
        return jsonify({'error': 'label is required'}), 400
    contact, err = db.create(request.user_id, **data)
    if err == 'conflict':
        return jsonify({'error': 'You already have a Home/Work contact'}), 409
    return jsonify(contact), 201


@contacts_bp.route('/api/contacts/nearby', methods=['GET'])
@require_auth
def nearby_contacts():
    """Return proximity-enabled contacts near ?lat=&lon= (optional ?radius_m=, default 75)."""
    db = _get_db()
    lat = request.args.get('lat', type=float)
    lon = request.args.get('lon', type=float)
    radius_m = request.args.get('radius_m', 75, type=float)
    if lat is None or lon is None:
        return jsonify({'error': 'lat and lon required'}), 400
    return jsonify(db.find_nearby(request.user_id, lat, lon, radius_m))


@contacts_bp.route('/api/contacts/deleted', methods=['GET'])
@require_auth
def list_deleted():
    """List the caller's soft-deleted contacts."""
    db = _get_db()
    return jsonify(db.list_deleted(request.user_id))


# NOTE(review): the item routes below had lost their URL variable
# ('/api/contacts/' and '/api/contacts//restore'), so the views could never
# receive contact_id. Restored as an integer converter to match the INTEGER
# primary key — confirm against the frontend's URLs.
@contacts_bp.route('/api/contacts/<int:contact_id>', methods=['GET'])
@require_auth
def get_contact(contact_id):
    """Return one live contact, or 404."""
    db = _get_db()
    contact = db.get(request.user_id, contact_id)
    if not contact:
        return jsonify({'error': 'Not found'}), 404
    return jsonify(contact)


@contacts_bp.route('/api/contacts/<int:contact_id>', methods=['PATCH'])
@require_auth
def update_contact(contact_id):
    """Apply the JSON body to a live contact; 404 if missing, 409 on a Home/Work conflict."""
    db = _get_db()
    data = _json_fields()
    if data is None:
        return jsonify({'error': 'JSON object required'}), 400
    try:
        contact = db.update(request.user_id, contact_id, **data)
    except sqlite3.IntegrityError:
        # e.g. relabelling to 'Home' while another live Home exists.
        return jsonify({'error': 'You already have a Home/Work contact'}), 409
    if not contact:
        return jsonify({'error': 'Not found'}), 404
    return jsonify(contact)


@contacts_bp.route('/api/contacts/<int:contact_id>', methods=['DELETE'])
@require_auth
def delete_contact(contact_id):
    """Soft-delete a contact and return the resulting row, or 404."""
    db = _get_db()
    contact = db.soft_delete(request.user_id, contact_id)
    if not contact:
        return jsonify({'error': 'Not found'}), 404
    return jsonify(contact)


@contacts_bp.route('/api/contacts/<int:contact_id>/restore', methods=['POST'])
@require_auth
def restore_contact(contact_id):
    """Undelete a contact; 404 if not soft-deleted, 409 on a Home/Work conflict."""
    db = _get_db()
    contact, err = db.restore(request.user_id, contact_id)
    if err == 'not_found':
        return jsonify({'error': 'Not found'}), 404
    if err == 'conflict':
        return jsonify({'error': 'You already have a Home/Work contact'}), 409
    return jsonify(contact)


@contacts_bp.route('/api/contacts/<int:contact_id>/restore-as', methods=['POST'])
@require_auth
def restore_as_contact(contact_id):
    """Undelete a contact under the new label given as {"label": "..."}."""
    db = _get_db()
    data = _json_fields()
    raw_label = data.get('label') if data is not None else None
    # A non-string label (number, list, null) has no .strip(); treat it as missing.
    new_label = raw_label.strip() if isinstance(raw_label, str) else ''
    if not new_label:
        return jsonify({'error': 'label is required'}), 400
    contact, err = db.restore_as(request.user_id, contact_id, new_label)
    if err == 'not_found':
        return jsonify({'error': 'Not found'}), 404
    if err == 'invalid_label':
        return jsonify({'error': 'Invalid label'}), 400
    if err == 'conflict':
        return jsonify({'error': 'Label conflict'}), 409
    return jsonify(contact)


@contacts_bp.route('/api/contacts/<int:contact_id>/purge', methods=['DELETE'])
@require_auth
def purge_contact(contact_id):
    """Permanently remove a contact that has already been soft-deleted."""
    db = _get_db()
    ok, err = db.purge(request.user_id, contact_id)
    if err == 'not_found':
        return jsonify({'error': 'Not found'}), 404
    if err == 'not_deleted':
        return jsonify({'error': 'Contact must be deleted before purging'}), 400
    return jsonify({'ok': True})
"""
RECON geocode — structured preprocessing, multi-source retrieval, reranking.

Replaces the naive Photon-only search with:
  1. usaddress parsing + intent classification (ADDRESS / POI / LOCALITY / COORD / POSTCODE)
  2. Multi-source retrieval: ADDRESS → Netsyms + Photon; POI/LOCALITY → Photon /api
  3. Python reranker with weighted signals

Public entry point: geocode(query, limit) → {query, results, count}
"""

import math
import re
import logging

import requests
import usaddress
from rapidfuzz import fuzz

from .utils import setup_logging

logger = setup_logging('recon.geocode')

# ── Trace logger for reranking audit ──
_trace_logger = logging.getLogger('recon.geocode.trace')
if _trace_logger.handlers:
    # Module re-imported/reloaded: reuse the handler instead of stacking a
    # second one (which would write every trace line twice).
    _trace_handler = _trace_logger.handlers[0]
else:
    try:
        _trace_handler = logging.FileHandler('/tmp/geocode_rerank_trace.log')
    except OSError:
        # The trace file is an audit aid only; an unwritable /tmp must not
        # stop the geocoder from importing.
        _trace_handler = logging.NullHandler()
    _trace_handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
    _trace_logger.addHandler(_trace_handler)
_trace_logger.setLevel(logging.DEBUG)

# ── Config constants ──
PHOTON_URL = "http://localhost:2322"
GEOCODE_BIAS_LAT = 42.5736
GEOCODE_BIAS_LON = -114.6066
GEOCODE_BIAS_ZOOM = 10
ADDRESS_BOOK_ANNOTATION_RADIUS_M = 75

# ── Reranker weights ──
# Derived from research analysis of failure modes:
# housenumber_exact is the strongest signal because Photon's soft-boost
# lets wrong-number results bubble up. street_name_fuzz and locality_fuzz
# handle abbreviation/case variation. source_authority gives Netsyms a
# boost for US addresses since it has USPS-verified data.
W_HOUSENUMBER_EXACT = 6.0        # exact housenumber match
W_HOUSENUMBER_MISMATCH = -5.0    # housenumber present but wrong
W_STREET_NAME_FUZZ = 3.0         # fuzzy street name similarity [0..1] * weight
W_TOKEN_COVERAGE = 2.0           # fraction of query tokens found in result
W_STREET_TYPE_MATCH = 1.5        # "st" matches "street", etc.
W_LOCALITY_FUZZ = 2.0            # city/state fuzzy match
W_SOURCE_AUTHORITY = 2.0         # Netsyms for US addresses
W_LAYER_RANK = 1.0               # type-appropriate results ranked higher
W_PHOTON_POSITION_NORM = 1.0     # Photon's native ranking (normalized by position)
W_STATE_EXACT = 1.0              # exact state code match
W_POI_CLASS_BOOST = 3.0          # amenity/shop/etc boost for business-name queries
W_HIGHWAY_CLASS_PENALTY = -4.0   # highway/route penalty for business-name queries

# ── US abbreviation expansions ──
# Applied ONLY to parsed StreetName/StreetNamePostType tokens, NOT to ordinals.
_STREET_TYPE_ABBREVS = {
    'st': 'street', 'ave': 'avenue', 'blvd': 'boulevard', 'dr': 'drive',
    'rd': 'road', 'ln': 'lane', 'ct': 'court', 'cir': 'circle',
    'pl': 'place', 'way': 'way', 'pkwy': 'parkway', 'hwy': 'highway',
    'trl': 'trail', 'ter': 'terrace', 'sq': 'square',
}
# Expanded street-type words, built once instead of per scored candidate.
_STREET_TYPE_NAMES = frozenset(_STREET_TYPE_ABBREVS.values())
_DIRECTIONAL_ABBREVS = {
    'n': 'north', 's': 'south', 'e': 'east', 'w': 'west',
    'ne': 'northeast', 'nw': 'northwest', 'se': 'southeast', 'sw': 'southwest',
}
_ORDINAL_RE = re.compile(r'^\d+(st|nd|rd|th)$', re.IGNORECASE)

# ── Road keywords (for detecting when query is about a road vs a business) ──
_ROAD_KEYWORDS = (
    set(_STREET_TYPE_ABBREVS.keys())
    | set(_STREET_TYPE_ABBREVS.values())
    | {'route', 'rte', 'pass'}
)

# ── US state codes ──
_STATE_CODES = {
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'DC',
}

# ── Full state name → code (for intent classifier) ──
_STATE_NAME_TO_CODE = {
    'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
    'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
    'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
    'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
    'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
    'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN',
    'mississippi': 'MS', 'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE',
    'nevada': 'NV', 'new hampshire': 'NH', 'new jersey': 'NJ',
    'new mexico': 'NM', 'new york': 'NY', 'north carolina': 'NC',
    'north dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK', 'oregon': 'OR',
    'pennsylvania': 'PA', 'rhode island': 'RI', 'south carolina': 'SC',
    'south dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
    'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA',
    'west virginia': 'WV', 'wisconsin': 'WI', 'wyoming': 'WY',
}

# Coordinate regex
_COORD_RE = re.compile(r'^\s*(-?\d+\.?\d*)\s*[,\s]\s*(-?\d+\.?\d*)\s*$')


# ═══════════════════════════════════════════════════════════════════
# STEP 1: PREPROCESSING
# ═══════════════════════════════════════════════════════════════════

def _parse_coords(text):
    """Return (lat, lon) if text looks like coordinates with valid bounds, else None."""
    m = _COORD_RE.match(text.strip())
    if not m:
        return None
    lat, lon = float(m.group(1)), float(m.group(2))
    if -90 <= lat <= 90 and -180 <= lon <= 180:
        return lat, lon
    return None


def _classify_and_parse(query):
    """
    Parse query with usaddress, classify intent, expand abbreviations.

    Returns (intent, parsed_dict) where:
      intent: 'ADDRESS' | 'LOCALITY' | 'POSTCODE' | 'COORD' | 'UNKNOWN'
              (this function never returns 'POI'; free-text queries come
              back as 'UNKNOWN')
      parsed_dict: {number, street, street_raw, city, state, zipcode,
                    raw_query, expanded_query}
    """
    q = query.strip()
    parsed = {
        'number': None, 'street': None, 'street_raw': None,
        'city': None, 'state': None,
        'zipcode': None, 'raw_query': q, 'expanded_query': q,
    }

    # Coordinate check first
    if _parse_coords(q):
        return 'COORD', parsed

    # Try usaddress
    try:
        tagged, addr_type = usaddress.tag(q)
    except usaddress.RepeatedLabelError:
        # Ambiguous input — fall back to free-text Photon
        return 'UNKNOWN', parsed

    # Extract components
    number = tagged.get('AddressNumber', '').strip()
    street_name = tagged.get('StreetName', '').strip()
    street_pre_dir = tagged.get('StreetNamePreDirectional', '').strip()
    street_post_type = tagged.get('StreetNamePostType', '').strip()
    place = tagged.get('PlaceName', '').strip()
    state = tagged.get('StateName', '').strip()
    zipcode = tagged.get('ZipCode', '').strip()

    # ── Fix usaddress edge case: "214 N St Filer" ──
    # usaddress reads a short directional + "St" as PreDirectional + empty,
    # mashing "St Filer" into StreetName. Detect: PreDirectional is at most
    # two characters, StreetName has 2+ tokens where the first is a street type.
    if (street_pre_dir and len(street_pre_dir) <= 2
            and not street_name.startswith(street_pre_dir)
            and ' ' in street_name):
        name_tokens = street_name.split()
        first_lower = name_tokens[0].lower()
        if first_lower in _STREET_TYPE_ABBREVS or first_lower in _STREET_TYPE_NAMES:
            # "N" is actually the street name, "St" is the post-type
            street_name = street_pre_dir
            street_post_type = name_tokens[0]
            if len(name_tokens) > 1:
                place = ' '.join(name_tokens[1:])
            street_pre_dir = ''

    # ── Expand abbreviations (guard ordinals) ──
    expanded_parts = []

    if number:
        parsed['number'] = number
        expanded_parts.append(number)

    if street_pre_dir:
        exp = _DIRECTIONAL_ABBREVS.get(street_pre_dir.lower(), street_pre_dir)
        expanded_parts.append(exp)

    if street_name:
        # Don't expand ordinals: "21st" stays "21st"
        if _ORDINAL_RE.match(street_name):
            expanded_parts.append(street_name)
        else:
            # Expand directional abbreviation if it IS the street name
            exp = _DIRECTIONAL_ABBREVS.get(street_name.lower(), street_name)
            expanded_parts.append(exp)

    if street_post_type:
        if _ORDINAL_RE.match(street_post_type):
            expanded_parts.append(street_post_type)
        else:
            exp = _STREET_TYPE_ABBREVS.get(street_post_type.lower(), street_post_type)
            expanded_parts.append(exp)

    # Build raw street (original abbreviations, for Netsyms) and expanded (for Photon)
    raw_street_parts = []
    if street_pre_dir:
        raw_street_parts.append(street_pre_dir)
    if street_name:
        raw_street_parts.append(street_name)
    if street_post_type:
        raw_street_parts.append(street_post_type)
    parsed['street_raw'] = ' '.join(raw_street_parts)

    # Build the full expanded street
    if expanded_parts:
        # The street is everything after the number
        street_full = ' '.join(expanded_parts[1:] if number else expanded_parts)
        parsed['street'] = street_full

    if place:
        parsed['city'] = place
        expanded_parts.append(place)
    if state:
        parsed['state'] = state.upper()
        expanded_parts.append(state)
    if zipcode:
        parsed['zipcode'] = zipcode
        expanded_parts.append(zipcode)

    parsed['expanded_query'] = ' '.join(expanded_parts)

    # ── Intent classification ──
    if addr_type == 'Street Address' and number:
        return 'ADDRESS', parsed
    elif zipcode and not number and not street_name:
        return 'POSTCODE', parsed
    elif addr_type == 'Ambiguous':
        # Check if it looks like a locality: last token(s) are a state code or name
        tokens = q.replace(',', ' ').split()
        if len(tokens) >= 2:
            last_upper = tokens[-1].upper()
            if last_upper in _STATE_CODES:
                parsed['city'] = ' '.join(tokens[:-1])
                parsed['state'] = last_upper
                return 'LOCALITY', parsed
            # Check full state names (single-word like "idaho" or two-word like "new york")
            last_lower = tokens[-1].lower()
            if last_lower in _STATE_NAME_TO_CODE:
                parsed['city'] = ' '.join(tokens[:-1])
                parsed['state'] = _STATE_NAME_TO_CODE[last_lower]
                return 'LOCALITY', parsed
            if len(tokens) >= 3:
                two_word = f"{tokens[-2].lower()} {last_lower}"
                if two_word in _STATE_NAME_TO_CODE:
                    parsed['city'] = ' '.join(tokens[:-2])
                    parsed['state'] = _STATE_NAME_TO_CODE[two_word]
                    return 'LOCALITY', parsed
        return 'UNKNOWN', parsed
    else:
        return 'UNKNOWN', parsed


# ═══════════════════════════════════════════════════════════════════
# STEP 2: RETRIEVAL
# ═══════════════════════════════════════════════════════════════════

def _retrieve_netsyms(parsed, limit=10, lat=None, lon=None):
    """Query Netsyms for structured address lookup. Returns list of candidate dicts.

    With *lat*/*lon* (the map viewport), a larger batch is fetched and the
    *limit* rows nearest the viewport are returned. Returns [] when the
    netsyms module cannot be imported or the query has neither a
    number+street nor a zipcode.
    """
    try:
        from . import netsyms
    except Exception as e:
        # Netsyms is an optional source; degrade to Photon-only.
        logger.debug("Netsyms unavailable: %s", e)
        return []

    results = []
    number = parsed.get('number', '')
    street = parsed.get('street_raw') or parsed.get('street', '')
    city = parsed.get('city', '')
    state = parsed.get('state', '')
    zipcode = parsed.get('zipcode', '')

    # When viewport provided, fetch more results to sort from
    fetch_limit = 200 if (lat is not None and lon is not None) else limit

    if number and street:
        rows = netsyms.lookup_by_street(
            number, street, city=city, state=state, zipcode=zipcode, limit=fetch_limit
        )
    elif zipcode:
        rows = netsyms.lookup_by_zipcode(zipcode, limit=fetch_limit)
    else:
        return []

    for row in rows:
        addr_parts = [row['number'], row['street']]
        if row.get('street2'):
            addr_parts.append(row['street2'])
        addr_parts.extend([row['city'], row['state'], row['zipcode']])
        display = ' '.join(p for p in addr_parts if p)
        results.append({
            'name': display,
            'lat': row['lat'],
            'lon': row['lon'],
            'source': 'netsyms',
            'type': 'street_address',
            'raw': row,
            '_number': row.get('number', ''),
            '_street': row.get('street', ''),
            '_city': row.get('city', ''),
            '_state': row.get('state', ''),
        })
    # Sort by viewport distance if lat/lon provided, then limit
    if lat is not None and lon is not None and results:
        # A degree of longitude is cos(lat) times shorter than a degree of
        # latitude; without the scale, east-west offsets are over-weighted.
        lon_scale = math.cos(math.radians(lat))
        results.sort(
            key=lambda r: (r["lat"] - lat) ** 2 + ((r["lon"] - lon) * lon_scale) ** 2
        )
        results = results[:limit]
    return results


def _retrieve_photon_structured(parsed, limit=10):
    """Query Photon /structured endpoint for address lookup.

    Returns [] when the parsed query has no street or the request fails.
    """
    params = {'limit': limit, 'countrycode': 'US'}
    if parsed.get('street'):
        params['street'] = parsed['street']
    if parsed.get('number'):
        params['housenumber'] = parsed['number']
    if parsed.get('city'):
        params['city'] = parsed['city']
    if parsed.get('state'):
        params['state'] = parsed['state']

    if 'street' not in params:
        return []

    try:
        resp = requests.get(f"{PHOTON_URL}/structured", params=params, timeout=5)
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        logger.debug("Photon /structured failed: %s", e)
        return []

    return _parse_photon_features(data.get('features', []), 'photon')


def _retrieve_photon_freetext(query, limit=10, lat=None, lon=None, zoom=None):
    """Query Photon /api for free-text search with location bias.

    Missing lat/lon/zoom fall back to the GEOCODE_BIAS_* defaults.
    Returns [] when the request fails.
    """
    try:
        params = {
            'q': query,
            'limit': limit,
            'lat': lat if lat is not None else GEOCODE_BIAS_LAT,
            'lon': lon if lon is not None else GEOCODE_BIAS_LON,
            'zoom': int(zoom) if zoom is not None else GEOCODE_BIAS_ZOOM,
        }
        resp = requests.get(f"{PHOTON_URL}/api", params=params, timeout=5)
        resp.raise_for_status()
        data = resp.json()
    except Exception as e:
        # Best-effort source, but leave a trace like the /structured path does.
        logger.debug("Photon /api failed: %s", e)
        return []

    return _parse_photon_features(data.get('features', []), 'photon')


def _parse_photon_features(features, source):
    """Convert Photon GeoJSON features to candidate dicts.

    Each candidate records its position in *features* as ``_photon_rank``
    so the reranker can credit Photon's own ordering.
    """
    results = []
    for i, feature in enumerate(features):
        # `or` rather than a .get() default: an explicit JSON null for
        # properties/geometry/coordinates would otherwise crash below.
        props = feature.get('properties') or {}
        coords = (feature.get('geometry') or {}).get('coordinates') or [0, 0]

        osm_key = props.get('osm_key', '')
        osm_value = props.get('osm_value', '')
        feat_type = props.get('type', '')
        has_hn = bool(props.get('housenumber'))

        if osm_key in ('amenity', 'shop', 'tourism', 'leisure', 'office'):
            rtype = 'poi'
        elif has_hn or osm_value in ('house', 'residential'):
            rtype = 'street_address'
        elif feat_type in ('city', 'town', 'village', 'hamlet', 'county', 'state', 'country'):
            rtype = 'locality'
        else:
            rtype = 'poi'

        # Build display name
        # NOTE(review): nesting reconstructed from a flattened diff — the
        # name-differs-from-street check is taken to belong to the
        # housenumber+street branch; confirm against the repository.
        parts = []
        hn = props.get('housenumber')
        street = props.get('street')
        name = props.get('name', '')
        if hn and street:
            parts.append(f"{hn} {street}")
            if name and name != street:
                parts.append(name)
        elif name:
            parts.append(name)
        elif street:
            parts.append(street)
        for key in ('city', 'county', 'state', 'country'):
            v = props.get(key)
            if v and (not parts or v != parts[-1]):
                parts.append(v)
        display = ', '.join(p for p in parts if p) or 'Unknown'

        results.append({
            'name': display,
            'lat': coords[1],
            'lon': coords[0],
            'source': source,
            'type': rtype,
            'raw': props,
            '_photon_rank': i,
            '_number': props.get('housenumber', ''),
            '_street': props.get('street', ''),
            # For locality results, the name IS the city (Photon omits 'city' on city-type features)
            '_city': props.get('city', '') or (props.get('name', '') if rtype == 'locality' else ''),
            '_state': props.get('state', ''),
        })
    return results


# ═══════════════════════════════════════════════════════════════════
# STEP 3: RERANKER
# ═══════════════════════════════════════════════════════════════════

def _expand_street_type(s):
    """Expand a street type abbreviation for comparison."""
    return _STREET_TYPE_ABBREVS.get(s.lower(), s.lower())


def _score_candidate(candidate, parsed, intent):
    """
    Score a candidate against the parsed query.
    Returns (total_score, signal_breakdown_dict).

    The total is the sum of the weighted signals (W_* constants), rounded
    to two decimals; the breakdown maps each signal that fired to its
    contribution.
    """
    signals = {}
    total = 0.0

    query_number = (parsed.get('number') or '').strip().upper()
    query_street = (parsed.get('street') or '').strip().upper()
    query_city = (parsed.get('city') or '').strip().upper()
    query_state = (parsed.get('state') or '').strip().upper()

    cand_number = (candidate.get('_number') or '').strip().upper()
    cand_street = (candidate.get('_street') or '').strip().upper()
    cand_city = (candidate.get('_city') or '').strip().upper()
    cand_state = (candidate.get('_state') or '').strip().upper()

    # ── Housenumber ──
    if intent == 'ADDRESS' and query_number:
        if cand_number == query_number:
            signals['housenumber_exact'] = W_HOUSENUMBER_EXACT
            total += W_HOUSENUMBER_EXACT
        elif cand_number and cand_number != query_number:
            signals['housenumber_mismatch'] = W_HOUSENUMBER_MISMATCH
            total += W_HOUSENUMBER_MISMATCH

    # ── Street name fuzz ──
    if query_street and cand_street:
        # Expand both for comparison
        q_expanded = ' '.join(_expand_street_type(t) for t in query_street.split())
        c_expanded = ' '.join(_expand_street_type(t) for t in cand_street.split())
        ratio = fuzz.token_sort_ratio(q_expanded, c_expanded) / 100.0
        score = ratio * W_STREET_NAME_FUZZ
        signals['street_name_fuzz'] = round(score, 2)
        total += score

    # ── Street type match ──
    if query_street and cand_street:
        q_street_tokens = set(_expand_street_type(t) for t in query_street.split())
        c_street_tokens = set(_expand_street_type(t) for t in cand_street.split())
        # Check if the street type words overlap
        q_types = q_street_tokens & _STREET_TYPE_NAMES
        c_types = c_street_tokens & _STREET_TYPE_NAMES
        if q_types and q_types & c_types:
            signals['street_type_match'] = W_STREET_TYPE_MATCH
            total += W_STREET_TYPE_MATCH

    # ── Token coverage ──
    raw_q = parsed.get('raw_query', '').upper()
    q_tokens = set(raw_q.replace(',', ' ').split())
    if q_tokens:
        cand_text = candidate.get('name', '').upper()
        # Substring test against the display name, not token equality.
        matched = sum(1 for t in q_tokens if t in cand_text)
        coverage = matched / len(q_tokens)
        score = coverage * W_TOKEN_COVERAGE
        signals['token_coverage'] = round(score, 2)
        total += score

    # ── Locality fuzz ──
    if query_city and cand_city:
        ratio = fuzz.ratio(query_city, cand_city) / 100.0
        score = ratio * W_LOCALITY_FUZZ
        signals['locality_fuzz'] = round(score, 2)
        total += score

    # ── State exact ──
    if query_state and cand_state:
        if cand_state == query_state:
            signals['state_exact'] = W_STATE_EXACT
            total += W_STATE_EXACT

    # ── Source authority ──
    if candidate.get('source') == 'netsyms' and intent == 'ADDRESS':
        signals['source_authority'] = W_SOURCE_AUTHORITY
        total += W_SOURCE_AUTHORITY

    # ── Layer rank (type-appropriate bonus) ──
    cand_type = candidate.get('type', '')
    if intent == 'ADDRESS' and cand_type == 'street_address':
        signals['layer_rank'] = W_LAYER_RANK
        total += W_LAYER_RANK
    elif intent == 'LOCALITY' and cand_type == 'locality':
        signals['layer_rank'] = W_LAYER_RANK
        total += W_LAYER_RANK
    elif intent == 'POI' and cand_type == 'poi':
        signals['layer_rank'] = W_LAYER_RANK
        total += W_LAYER_RANK

    # ── Photon position normalization ──
    photon_rank = candidate.get('_photon_rank')
    if photon_rank is not None:
        # Top result gets full bonus, decays linearly (zero from rank 10 on)
        score = max(0, (1.0 - photon_rank / 10.0)) * W_PHOTON_POSITION_NORM
        signals['photon_position'] = round(score, 2)
        total += score

    # ── Business intent POI boost ──
    # When the query has no road keywords (likely a business/POI search),
    # boost amenity/shop/etc results and penalize highway/route results.
    # Skipped for LOCALITY, POSTCODE, COORD queries where class is irrelevant.
    if intent not in ('LOCALITY', 'POSTCODE', 'COORD'):
        q_tokens_lower = set(parsed.get('raw_query', '').lower().replace(',', ' ').split())
        if not (q_tokens_lower & _ROAD_KEYWORDS):
            osm_key = (candidate.get('raw') or {}).get('osm_key', '')
            if osm_key in ('amenity', 'shop', 'tourism', 'leisure', 'office', 'craft'):
                signals['poi_class_boost'] = W_POI_CLASS_BOOST
                total += W_POI_CLASS_BOOST
            elif osm_key in ('highway', 'route'):
                signals['highway_class_penalty'] = W_HIGHWAY_CLASS_PENALTY
                total += W_HIGHWAY_CLASS_PENALTY

    return round(total, 2), signals


def _build_match_code(candidate, parsed, intent):
    """Build a match_code dict indicating match quality for each field.

    Values are 'matched', 'unmatched', or 'inferred' (the query named the
    field but the candidate carries no value for it). Fields the query did
    not name are omitted.
    """
    mc = {}
    # NOTE(review): nesting reconstructed from a flattened diff — only the
    # housenumber check is taken to be gated on ADDRESS intent, mirroring
    # _score_candidate; confirm against the repository.
    if intent == 'ADDRESS':
        q_num = (parsed.get('number') or '').strip().upper()
        c_num = (candidate.get('_number') or '').strip().upper()
        if q_num and c_num == q_num:
            mc['housenumber'] = 'matched'
        elif q_num and c_num:
            mc['housenumber'] = 'unmatched'
        elif q_num and not c_num:
            mc['housenumber'] = 'inferred'

    q_street = (parsed.get('street') or '').strip().upper()
    c_street = (candidate.get('_street') or '').strip().upper()
    if q_street and c_street:
        q_exp = ' '.join(_expand_street_type(t) for t in q_street.split())
        c_exp = ' '.join(_expand_street_type(t) for t in c_street.split())
        ratio = fuzz.token_sort_ratio(q_exp, c_exp) / 100.0
        mc['street'] = 'matched' if ratio > 0.8 else 'unmatched'
    elif q_street:
        mc['street'] = 'inferred'

    q_city = (parsed.get('city') or '').strip().upper()
    c_city = (candidate.get('_city') or '').strip().upper()
    if q_city and c_city:
        ratio = fuzz.ratio(q_city, c_city) / 100.0
        mc['city'] = 'matched' if ratio > 0.8 else 'unmatched'
    elif q_city:
        mc['city'] = 'inferred'

    return mc


def _rerank(candidates, parsed, intent, query, limit):
    """Score, sort, and trim candidates. Trace-logs every scored candidate.

    Mutates each candidate in place (adds ``_score`` and ``_signals``) and
    returns up to *limit* cleaned result dicts, best first, each with a
    ``confidence`` bucket derived from its score.
    """
    scored = []
    for c in candidates:
        total, signals = _score_candidate(c, parsed, intent)
        c['_score'] = total
        c['_signals'] = signals
        scored.append(c)

    scored.sort(key=lambda c: c['_score'], reverse=True)

    # Trace log for audit
    _trace_logger.debug("─── Query: %r intent=%s ───", query, intent)
    for i, c in enumerate(scored):
        osm_key = (c.get('raw') or {}).get('osm_key', '—')
        osm_val = (c.get('raw') or {}).get('osm_value', '—')
        _trace_logger.debug(
            " #%d score=%.2f src=%s key=%s/%s name=%s",
            i, c['_score'], c.get('source', '?'), osm_key, osm_val,
            c.get('name', '?')[:60]
        )
        _trace_logger.debug(" signals=%s", c.get('_signals', {}))

    # Clean internal fields and add match_code
    result = []
    for c in scored[:limit]:
        mc = _build_match_code(c, parsed, intent)

        # Assign confidence from score
        score = c.get('_score', 0)
        if score >= 10:
            confidence = 'exact'
        elif score >= 5:
            confidence = 'high'
        elif score >= 2:
            confidence = 'medium'
        else:
            confidence = 'low'

        entry = {
            'name': c['name'],
            'lat': c['lat'],
            'lon': c['lon'],
            'source': c['source'],
            'confidence': confidence,
            'type': c.get('type', 'poi'),
            'raw': c.get('raw'),
        }
        if mc:
            entry['match_code'] = mc
        result.append(entry)

    return result
def geocode(query, limit=10, lat=None, lon=None, zoom=None):
    """
    Structured geocoding with multi-source retrieval and reranking.

    Pipeline: literal-coordinate detection, single-word address-book
    nickname short-circuit, intent classification, candidate retrieval
    (Netsyms and/or Photon depending on intent), same-source proximity
    de-duplication, reranking, and address-book annotation.

    Args:
        query: Free-text query. None or blank yields an empty result set.
        limit: Maximum number of results; clamped to 1..20. A value that
            cannot be converted to int falls back to 10.
        lat, lon, zoom: Optional location hints forwarded to the retrievers.

    Returns {query, results: [...], count} — always 200-safe.
    """
    # A missing or non-numeric limit (e.g. straight from a query string)
    # must not raise: fall back to the default before clamping.
    try:
        limit = int(limit)
    except (TypeError, ValueError):
        limit = 10
    limit = max(1, min(limit, 20))
    q = (query or '').strip()
    empty = {'query': q, 'results': [], 'count': 0}

    if not q:
        return empty

    # ── Coordinate detection ──
    coords = _parse_coords(q)
    if coords:
        return {
            'query': q,
            'results': [{
                'name': q,
                'lat': coords[0],
                'lon': coords[1],
                'source': 'coordinates',
                'confidence': 'exact',
                'type': 'coordinates',
                'raw': None,
            }],
            'count': 1,
        }

    # ── Address book nickname short-circuit ──
    # Only single-word queries may short-circuit, so a multi-word address
    # that happens to contain an alias still goes through full retrieval.
    normalized_q = ' '.join(q.lower().replace(',', ' ').split())
    is_single_word = ' ' not in normalized_q
    try:
        from . import address_book
        ab_match = address_book.lookup(q)
        # Compare against None rather than truthiness: 0.0 is a valid
        # latitude/longitude and must not disable the short-circuit.
        if (ab_match
                and ab_match['confidence'] == 'exact'
                and ab_match.get('lat') is not None
                and ab_match.get('lon') is not None
                and is_single_word):
            logger.info("geocode: nickname short-circuit %r → %s", q, ab_match['name'])
            return {
                'query': q,
                'results': [{
                    'name': ab_match.get('address') or ab_match['name'],
                    'lat': ab_match['lat'],
                    'lon': ab_match['lon'],
                    'source': 'address_book',
                    'confidence': 'exact',
                    'type': 'nickname',
                    'raw': ab_match,
                }],
                'count': 1,
            }
    except Exception as e:
        # Best effort: an address-book failure must never block geocoding.
        logger.debug("geocode: address_book lookup failed: %s", e)

    # ── Classify intent + parse ──
    intent, parsed = _classify_and_parse(q)
    logger.debug("geocode: intent=%s parsed=%s", intent, parsed)

    # ── Retrieve candidates ──
    candidates = []

    if intent == 'ADDRESS':
        # Netsyms (structured) + Photon (freetext with expanded query).
        # The calls run one after another; results are simply concatenated.
        netsyms_results = _retrieve_netsyms(parsed, limit=limit, lat=lat, lon=lon)
        photon_results = _retrieve_photon_freetext(
            parsed.get('expanded_query', q), limit=limit, lat=lat, lon=lon, zoom=zoom
        )
        # Also try Photon /structured for addresses
        photon_struct = _retrieve_photon_structured(parsed, limit=5)
        candidates = netsyms_results + photon_results + photon_struct

    elif intent == 'POSTCODE':
        netsyms_results = _retrieve_netsyms(parsed, limit=limit, lat=lat, lon=lon)
        photon_results = _retrieve_photon_freetext(q, limit=limit, lat=lat, lon=lon, zoom=zoom)
        candidates = netsyms_results + photon_results

    elif intent in ('LOCALITY', 'POI', 'UNKNOWN'):
        candidates = _retrieve_photon_freetext(q, limit=limit, lat=lat, lon=lon, zoom=zoom)

    # ── Deduplicate by (lat, lon) proximity ──
    # Only candidates from the SAME source within 50 m collapse; the same
    # place reported by two different sources is kept so the reranker can
    # choose between them.
    deduped = []
    for c in candidates:
        is_dup = any(
            _haversine_m(c['lat'], c['lon'], existing['lat'], existing['lon']) < 50
            and c.get('source') == existing.get('source')
            for existing in deduped
        )
        if not is_dup:
            deduped.append(c)
    candidates = deduped

    # ── Rerank ──
    results = _rerank(candidates, parsed, intent, q, limit)

    # ── Address book annotation ──
    _annotate_with_address_book(results)

    logger.info("geocode: %r → intent=%s, %d results", q, intent, len(results))
    return {'query': q, 'results': results, 'count': len(results)}
r["results"][0]["source"] == "photon", + }, + { + "name": "starbucks filer → POI result", + "query": "starbucks filer", + "check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "photon", + }, + { + "name": "filer idaho → locality", + "query": "filer idaho", + "check": lambda r: ( + r["count"] >= 1 + and r["results"][0]["source"] == "photon" + and r["results"][0]["type"] == "locality" + ), + }, + { + "name": "filer → partial query, at least 1 result", + "query": "filer", + "check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "photon", + }, + { + "name": "42.5736, -114.6066 → coordinates (with space)", + "query": "42.5736, -114.6066", + "check": lambda r: ( + r["count"] == 1 + and r["results"][0]["source"] == "coordinates" + and r["results"][0]["confidence"] == "exact" + and r["results"][0]["type"] == "coordinates" + ), + }, + { + "name": "42.5736,-114.6066 → coordinates (no space)", + "query": "42.5736,-114.6066", + "check": lambda r: ( + r["count"] == 1 + and r["results"][0]["source"] == "coordinates" + and r["results"][0]["confidence"] == "exact" + ), + }, + { + "name": "boise → at least 1 result", + "query": "boise", + "check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "photon", + }, + { + "name": "toronto → CA canary", + "query": "toronto", + "check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "photon", + }, + { + "name": "asdfghjklqwerty → empty results, 200 OK", + "query": "asdfghjklqwerty", + "check": lambda r: r["count"] == 0 and r["results"] == [], + }, + { + "name": "empty query → empty results", + "query": "", + "check": lambda r: r["count"] == 0 and r["results"] == [], + }, +] + +passed = 0 +failed = 0 + +for t in TESTS: + q = urllib.parse.urlencode({"q": t["query"]}) if t["query"] else "q=" + url = f"{BASE}/api/geocode?{q}" + try: + req = urllib.request.Request(url) + with urllib.request.urlopen(req, timeout=10) as resp: + status = resp.status + body = json.loads(resp.read()) + except 
urllib.error.HTTPError as e: + status = e.code + try: + body = json.loads(e.read()) + except Exception: + body = {} + except Exception as e: + status = 0 + body = {} + print(f" [FAIL] {t['name']}") + print(f" EXCEPTION: {e}") + failed += 1 + continue + + ok = status == 200 and t["check"](body) + tag = "PASS" if ok else "FAIL" + if ok: + passed += 1 + else: + failed += 1 + + top = body.get("results", [{}])[0] if body.get("results") else {} + top_summary = f"source={top.get('source','—')} type={top.get('type','—')} conf={top.get('confidence','—')} name={top.get('name','—')[:50]}" + print(f" [{tag}] {t['name']}") + if not ok: + print(f" HTTP {status}, count={body.get('count','?')}, top: {top_summary}") + else: + labeled = f" labeled_as={top.get('labeled_as')}" if top.get('labeled_as') else "" + print(f" → {top_summary}{labeled}") + +print(f"\n{passed} passed, {failed} failed") +sys.exit(0 if failed == 0 else 1) diff --git a/lib/google_places.py b/lib/google_places.py new file mode 100644 index 0000000..8272b81 --- /dev/null +++ b/lib/google_places.py @@ -0,0 +1,397 @@ +""" +Google Places (New) API client for tertiary enrichment. + +Searches for business POIs and fetches details (opening hours, phone, website) +when OSM + Overture data is incomplete. Uses field masks to minimize cost. 
def _get_db():
    """Return the module-level SQLite connection, creating it on first use.

    The database file is ``data/place_cache.db`` under the directory two
    levels above this module. On first use the connection is switched to WAL
    journaling with ``synchronous=NORMAL`` and the ``google_api_calls``
    counter table is created if missing.

    The connection is stored in ``_db_conn`` only after initialisation has
    fully succeeded. If a PRAGMA or the DDL fails, the connection is closed
    and the error re-raised, so the next call retries from scratch instead of
    handing out a connection whose counter table was never ensured.

    Returns:
        sqlite3.Connection: the shared connection (``check_same_thread=False``).

    Raises:
        sqlite3.Error: if the database cannot be opened or initialised.
    """
    global _db_conn
    if _db_conn is not None:
        return _db_conn

    db_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')
    db_path = os.path.join(db_dir, 'place_cache.db')
    conn = sqlite3.connect(db_path, check_same_thread=False)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        # Ensure google_api_calls table exists
        conn.execute("""
            CREATE TABLE IF NOT EXISTS google_api_calls (
                call_date TEXT PRIMARY KEY,
                call_count INTEGER NOT NULL DEFAULT 0
            )
        """)
        conn.commit()
    except Exception:
        # Do not cache (or leak) a half-initialised connection.
        conn.close()
        raise

    _db_conn = conn
    return _db_conn
def get_daily_count():
    """Return the number of Google API calls recorded for today.

    Reads the ``google_api_calls`` row keyed by today's ISO date
    (``date.today()``); returns 0 when no row exists yet.
    """
    day_key = date.today().isoformat()
    cursor = _get_db().execute(
        "SELECT call_count FROM google_api_calls WHERE call_date = ?", (day_key,)
    )
    record = cursor.fetchone()
    if not record:
        return 0
    return record[0]
def search_place(name, lat, lon, radius_m=200):
    """
    Search Google Places (New) for a business by name + location.

    Sends a single ``places:searchText`` request biased to a circle of
    ``radius_m`` metres around (lat, lon), asking for one result.

    Args:
        name: Text query (business name).
        lat, lon: Centre of the location bias circle.
        radius_m: Radius of the bias circle in metres.

    Returns the Google Place ID of the best match, or None when the API key
    is missing, the daily cap is reached, the request fails, or nothing
    matches. HTTP and network errors are logged, never raised.
    """
    key = _get_api_key()
    if not key:
        return None

    if not check_daily_cap():
        return None

    try:
        try:
            resp = requests.post(
                f'{API_BASE}/places:searchText',
                headers={
                    'Content-Type': 'application/json',
                    'X-Goog-Api-Key': key,
                    'X-Goog-FieldMask': 'places.id,places.displayName,places.location',
                },
                json={
                    'textQuery': name,
                    'locationBias': {
                        'circle': {
                            'center': {'latitude': lat, 'longitude': lon},
                            'radius': float(radius_m),
                        }
                    },
                    'maxResultCount': 1,
                },
                timeout=REQUEST_TIMEOUT,
            )
        finally:
            # Count every attempt, including ones that time out or error
            # client-side: the request may still have reached (and been
            # billed by) Google, and the cap exists to bound cost.
            increment_call_counter()

        if resp.status_code == 429:
            logger.warning("google_places: action=search place=%s result=rate_limited", name)
            # Quota exhausted upstream: stop calling for the rest of the day.
            _set_daily_count_to_cap()
            return None

        if resp.status_code == 403:
            logger.error("google_places: action=search place=%s result=forbidden (invalid key?)", name)
            return None

        if resp.status_code != 200:
            logger.warning("google_places: action=search place=%s result=error status=%d", name, resp.status_code)
            return None

        data = resp.json()
        places = data.get('places', [])
        if not places:
            logger.info("google_places: action=search place=%s result=miss", name)
            return None

        best = places[0]
        place_id = best.get('id')
        # displayName is only used for logging; tolerate it being null so a
        # genuine hit is not discarded as an error.
        display = (best.get('displayName') or {}).get('text', '?')
        logger.info("google_places: action=search place=%s result=hit google_name=%s id=%s", name, display, place_id)
        return place_id

    except requests.exceptions.Timeout:
        logger.warning("google_places: action=search place=%s result=timeout", name)
        return None
    except Exception as e:
        logger.error("google_places: action=search place=%s result=error err=%s", name, e)
        return None
+ """ + key = _get_api_key() + if not key: + return None + + if not check_daily_cap(): + return None + + try: + resp = requests.get( + f'{API_BASE}/places/{place_id}', + headers={ + 'X-Goog-Api-Key': key, + 'X-Goog-FieldMask': 'regularOpeningHours,internationalPhoneNumber,websiteUri', + }, + timeout=REQUEST_TIMEOUT, + ) + + increment_call_counter() + + if resp.status_code == 429: + logger.warning("google_places: action=details id=%s result=rate_limited", place_id) + _set_daily_count_to_cap() + return None + + if resp.status_code != 200: + logger.warning("google_places: action=details id=%s result=error status=%d", place_id, resp.status_code) + return None + + data = resp.json() + result = { + 'opening_hours': None, + 'opening_hours_raw': None, + 'phone_number': None, + 'website': None, + } + + # Phone + phone = data.get('internationalPhoneNumber') + if phone: + result['phone_number'] = phone.replace(' ', '').replace('-', '') + + # Website + result['website'] = data.get('websiteUri') + + # Opening hours + hours = data.get('regularOpeningHours') + if hours: + # Try OSM-compatible format from periods + periods = hours.get('periods', []) + if periods: + osm_str = _periods_to_osm(periods) + if osm_str: + result['opening_hours'] = osm_str + + # Fallback: weekday descriptions (human-readable) + if not result['opening_hours']: + descriptions = hours.get('weekdayDescriptions') + if descriptions: + result['opening_hours_raw'] = descriptions + + logger.info("google_places: action=details id=%s result=hit hours=%s phone=%s website=%s", + place_id, + 'yes' if result['opening_hours'] or result['opening_hours_raw'] else 'no', + 'yes' if result['phone_number'] else 'no', + 'yes' if result['website'] else 'no') + return result + + except requests.exceptions.Timeout: + logger.warning("google_places: action=details id=%s result=timeout", place_id) + return None + except Exception as e: + logger.error("google_places: action=details id=%s result=error err=%s", place_id, e) + return 
def _periods_to_osm(periods):
    """
    Render a Google Places ``periods`` list as an OSM opening_hours string.

    Input shape: [{"open": {"day": 0-6, "hour": H, "minute": M},
                   "close": {"day": 0-6, "hour": H, "minute": M}}, ...]
    with day 0 = Sunday. Missing hour/minute/day values default to 0.

    Output examples: "Mo-Fr 06:00-23:00; Sa,Su 07:00-23:00", a bare
    "09:00-17:00" when all seven days share the same hours, or "24/7".
    Returns None for empty input.
    """
    if not periods:
        return None

    def _hhmm(point):
        return f"{point.get('hour', 0):02d}:{point.get('minute', 0):02d}"

    # A lone period that opens at midnight and never closes means always open.
    if len(periods) == 1:
        only = periods[0]
        start = only.get('open', {})
        opens_at_midnight = start.get('hour', 0) == 0 and start.get('minute', 0) == 0
        if only.get('close') is None and opens_at_midnight:
            return '24/7'

    # Collect the time ranges per Google day index (keyed by the opening day).
    spans_by_day = {}
    for period in periods:
        start = period.get('open', {})
        end = period.get('close', {})
        opens = _hhmm(start)
        closes = _hhmm(end) if end else '24:00'
        # A 00:00 close is the end of the day, not the start of it.
        if closes == '00:00':
            closes = '24:00'
        # Several periods on one day (e.g. a lunch break) are comma-joined.
        spans_by_day.setdefault(start.get('day', 0), []).append(f"{opens}-{closes}")

    hours_for = {day: ','.join(spans) for day, spans in spans_by_day.items()}
    if not hours_for:
        return None

    # Identical hours on all seven days collapse to a day-less rule.
    distinct = set(hours_for.values())
    if len(hours_for) == 7 and len(distinct) == 1:
        everyday = distinct.pop()
        return '24/7' if everyday == '00:00-24:00' else everyday

    # Walk the week in OSM order (Mo..Su) and merge neighbouring days that
    # share the same hours; days with no hours break a run and are dropped.
    runs = []
    for day in (1, 2, 3, 4, 5, 6, 0):
        todays = hours_for.get(day)
        if runs and runs[-1][1] == todays:
            runs[-1][0].append(day)
        else:
            runs.append(([day], todays))
    runs = [(days, span) for days, span in runs if span]

    if not runs:
        return None

    rules = []
    for days, span in runs:
        first, last = _DAY_ABBR[days[0]], _DAY_ABBR[days[-1]]
        if len(days) == 1:
            label = first
        elif len(days) == 2:
            label = f"{first},{last}"
        else:
            label = f"{first}-{last}"
        rules.append(f"{label} {span}")

    return '; '.join(rules)
Fish and Wildlife Service', + 'USFS': 'Forest Service', + 'DOD': 'Department of Defense', + 'USACE': 'Army Corps of Engineers', + 'DOE': 'Department of Energy', + 'NPS': 'National Park Service', + 'NRCS': 'Natural Resources Conservation Service', + 'ARS': 'Agricultural Research Service', + 'BIA': 'Bureau of Indian Affairs', + 'NOAA': 'National Oceanic and Atmospheric Administration', + 'BPA': 'Bonneville Power Administration', + 'OTHF': 'Other or Unknown Federal Land', + 'TRIB': 'American Indian Lands', + 'SPR': 'State Park and Recreation', + 'SDC': 'State Department of Conservation', + 'SLB': 'State Land Board', +} + +AGENCY_TYPE_MAP = { + 'FED': 'Federal', + 'TRIB': 'American Indian Lands', + 'STAT': 'State', + 'DIST': 'Regional Agency Special District', + 'LOC': 'Local Government', + 'NGO': 'Non-Governmental Organization', + 'PVT': 'Private', + 'JNT': 'Joint', + 'UNK': 'Unknown', + 'TERR': 'Territorial', + 'DESG': 'Designation', +} + +DESIGNATION_TYPE_MAP = { + 'NP': 'National Park', + 'NM': 'National Monument', + 'NCA': 'Conservation Area', + 'NF': 'National Forest', + 'NG': 'National Grassland', + 'PUB': 'National Public Lands', + 'NT': 'National Scenic or Historic Trail', + 'NWR': 'National Wildlife Refuge', + 'WA': 'Wilderness Area', + 'WSR': 'Wild and Scenic River', + 'WSA': 'Wilderness Study Area', + 'MPA': 'Marine Protected Area', + 'NRA': 'National Recreation Area', + 'NSBV': 'National Scenic, Botanical or Volcanic Area', + 'NLS': 'National Lakeshore or Seashore', + 'IRA': 'Inventoried Roadless Area', + 'ACEC': 'Area of Critical Environmental Concern', + 'RNA': 'Research Natural Area', + 'REC': 'Recreation Management Area', + 'RMA': 'Resource Management Area', + 'WPA': 'Watershed Protection Area', + 'REA': 'Research or Educational Area', + 'HCA': 'Historic or Cultural Area', + 'MIT': 'Mitigation Land or Bank', + 'MIL': 'Military Land', + 'ACC': 'Access Area', + 'SDA': 'Special Designation Area', + 'PROC': 'Approved or Proclamation Boundary', + 'FOTH': 
def _decode(code, label_map):
    """Translate a PAD-US code into its human-readable label.

    The code is stringified and stripped before lookup. Codes missing from
    ``label_map`` are returned as the stripped raw code, and any falsy code
    (None, empty string, 0) yields an empty string.
    """
    if code:
        key = str(code).strip()
        return label_map.get(key, key)
    return ''
def lookup_landclass(lat, lon):
    """
    Look up PAD-US land classifications for a point.

    Runs a point-in-polygon query against ``pad_units`` (point built as
    lon/lat in SRID 4326) and decodes the coded columns through the module's
    label maps.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.

    Returns a list of at most 10 classification dicts, ordered by area
    ascending (smallest/most specific first). Empty list on error or no
    results. NULL columns come back as empty strings, never as the text
    ``'None'``.
    """
    def _text(value):
        # SQL NULL arrives as None; str(None) would leak the literal 'None'
        # into public_access_code / gap_status.
        return '' if value is None else str(value).strip()

    rows = _query_all(
        """SELECT unit_nm, mang_name, mang_type, own_name, own_type,
                  des_tp, gap_sts, pub_access, category, gis_acres, state_nm
           FROM pad_units
           WHERE ST_Intersects(geom, ST_SetSRID(ST_MakePoint(%s, %s), 4326))
           ORDER BY gis_acres ASC
           LIMIT 10""",
        (lon, lat)  # ST_MakePoint takes (x, y) = (lon, lat)
    )

    results = []
    for row in rows:
        pa_code = _text(row.get('pub_access'))

        results.append({
            'unit_name': (row.get('unit_nm') or '').strip(),
            'manager_name': _decode(row.get('mang_name'), AGENCY_NAME_MAP),
            'manager_type': _decode(row.get('mang_type'), AGENCY_TYPE_MAP),
            'owner_type': _decode(row.get('own_type'), AGENCY_TYPE_MAP),
            'designation_type': _decode(row.get('des_tp'), DESIGNATION_TYPE_MAP),
            'gap_status': _text(row.get('gap_sts')),
            'public_access': _decode(pa_code, PUBLIC_ACCESS_MAP),
            'public_access_code': pa_code,
            'category': _decode(row.get('category'), CATEGORY_MAP),
            'acres': row.get('gis_acres'),
            'state': _decode(row.get('state_nm'), STATE_MAP),
        })

    return results
def _haversine_m(lat1, lon1, lat2, lon2):
    """Great-circle distance in meters between two (lat, lon) points in degrees.

    Uses the haversine formula with an Earth radius of 6,371,000 m.
    """
    earth_radius_m = 6_371_000
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    half_dphi = math.radians(lat2 - lat1) / 2
    half_dlambda = math.radians(lon2 - lon1) / 2

    lat_term = math.sin(half_dphi) ** 2
    lon_term = math.cos(phi1) * math.cos(phi2) * math.sin(half_dlambda) ** 2
    chord = lat_term + lon_term

    central_angle = 2 * math.atan2(math.sqrt(chord), math.sqrt(1 - chord))
    return earth_radius_m * central_angle
def route(origin: str, destination: str, mode: str = "auto") -> dict:
    """
    Get a route between two locations.

    Both endpoints are geocoded via ``_geocode`` and the pair is sent to
    Valhalla's ``/route`` endpoint with distances in miles. Only the first
    leg of the returned trip is used.

    Args:
        origin: Starting location — address, place name, or "lat,lon"
        destination: Destination — address, place name, or "lat,lon"
        mode: Travel mode — auto, pedestrian, bicycle, truck. Any other
            value silently falls back to "auto".

    Returns:
        dict with summary, maneuvers, origin/destination info, and raw shape

    Raises:
        ValueError: if either endpoint cannot be geocoded (from ``_geocode``).
        RuntimeError: if Valhalla is unreachable, reports an error, or returns
            a response that does not have the expected structure.
    """
    if mode not in VALID_MODES:
        mode = "auto"

    # Geocode both endpoints
    orig_lat, orig_lon, orig_name = _geocode(origin)
    dest_lat, dest_lon, dest_name = _geocode(destination)

    # Query Valhalla
    valhalla_req = {
        "locations": [
            {"lat": orig_lat, "lon": orig_lon},
            {"lat": dest_lat, "lon": dest_lon},
        ],
        "costing": mode,
        "directions_options": {"units": "miles"},
    }

    try:
        resp = requests.post(
            f"{VALHALLA_URL}/route",
            json=valhalla_req,
            timeout=30,
        )
    except requests.RequestException as exc:
        # Keep the generic user-facing message but preserve the cause.
        raise RuntimeError("Navigation service unavailable") from exc

    if resp.status_code != 200:
        try:
            err = resp.json()
            msg = err.get("error", "Unknown routing error")
        except Exception:
            msg = f"Routing error (HTTP {resp.status_code})"
        raise RuntimeError(f"No route found between locations: {msg}")

    # A 200 with an unexpected body must surface as RuntimeError: callers
    # treat ValueError as "location not found", and a bare KeyError or
    # IndexError would escape their error handling entirely.
    try:
        trip = resp.json()["trip"]
        summary = trip["summary"]
        leg = trip["legs"][0]

        # Build maneuver list
        maneuvers = []
        for m in leg["maneuvers"]:
            streets = m.get("street_names", [])
            maneuvers.append({
                "instruction": m["instruction"],
                "distance_miles": round(m.get("length", 0), 2),
                "street_name": streets[0] if streets else "",
                "type": m.get("type", 0),
                "verbal_succinct": m.get("verbal_succinct_transition_instruction", ""),
            })

        distance_miles = round(summary["length"], 1)
        time_minutes = round(summary["time"] / 60, 1)  # Valhalla reports seconds
        shape = leg.get("shape", "")
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as exc:
        raise RuntimeError("Navigation service returned an unexpected response") from exc

    return {
        "origin": {"name": orig_name, "lat": orig_lat, "lon": orig_lon},
        "destination": {"name": dest_name, "lat": dest_lat, "lon": dest_lon},
        "summary": {
            "distance_miles": distance_miles,
            "time_minutes": time_minutes,
            "mode": mode,
        },
        "maneuvers": maneuvers,
        "shape": shape,
    }
"43.615,-116.2023", "auto") + assert r["summary"]["distance_miles"] > 100, f"Expected >100 mi, got {r['summary']['distance_miles']}" + assert len(r["maneuvers"]) > 3, f"Expected >3 maneuvers" + print(f" OK — {r['summary']['distance_miles']} mi, {r['summary']['time_minutes']} min") + + +def test_route_pedestrian(): + """route with pedestrian mode.""" + print("\nTEST 3: route('Buhl Idaho', 'Boise Idaho', 'pedestrian')") + r = route("Buhl Idaho", "Boise Idaho", "pedestrian") + assert r["summary"]["mode"] == "pedestrian" + assert r["summary"]["time_minutes"] > r["summary"]["distance_miles"], "Walking should take more min than miles" + print(f" OK — {r['summary']['distance_miles']} mi, {r['summary']['time_minutes']} min (pedestrian)") + + +def test_reverse_geocode(): + """reverse_geocode near Buhl, Idaho.""" + print("\nTEST 4: reverse_geocode(42.5991, -114.7636)") + result = reverse_geocode(42.5991, -114.7636) + assert "Buhl" in result or "Twin Falls" in result or "Idaho" in result, f"Expected Buhl/Idaho, got: {result}" + print(f" OK — {result}") + + +def test_route_bad_origin(): + """route with nonexistent place returns clean error.""" + print("\nTEST 5: route('nonexistent place xyz123abc', 'Boise Idaho')") + try: + r = route("nonexistent place xyz123abc", "Boise Idaho") + print(f" FAIL — expected error, got result: {r['summary']}") + return False + except ValueError as e: + print(f" OK — clean error: {e}") + except RuntimeError as e: + print(f" OK — runtime error: {e}") + + +if __name__ == "__main__": + passed = 0 + failed = 0 + tests = [test_route_named, test_route_coords, test_route_pedestrian, test_reverse_geocode, test_route_bad_origin] + + for test in tests: + try: + test() + passed += 1 + except Exception as e: + print(f" FAIL — {e}") + failed += 1 + + print(f"\n{'='*40}") + print(f"Results: {passed} passed, {failed} failed out of {len(tests)}") + sys.exit(1 if failed else 0) diff --git a/lib/netsyms_api.py b/lib/netsyms_api.py index dbae24e..4a0847f 100644 --- 
def _safe_float(val, lo, hi):
    """Return val as a float when it parses and lies within [lo, hi], else None.

    None, non-numeric input, and out-of-range values (NaN included, since it
    fails both comparisons) all yield None.
    """
    if val is None:
        return None
    try:
        number = float(val)
        # Keep the range test inside the try so a comparison TypeError is
        # swallowed the same way a parse failure is.
        in_range = lo <= number <= hi
    except (ValueError, TypeError):
        return None
    return number if in_range else None
@geocode_bp.route('/api/geocode')
def api_geocode():
    """
    Photon-first geocoding with ranked candidates.

    GET /api/geocode?q=&limit=&lat=&lon=&zoom=

    Always returns 200 OK with:
      {query, results: [{name, lat, lon, source, confidence, type, raw, ...}], count}

    - limit: clamped to 1..20; missing or non-integer values fall back to 10
    - lat / lon / zoom: optional viewport bias; a value that is missing,
      non-numeric, or out of range is passed on as None
    - source: "address_book" | "coordinates" | "photon"
    - confidence: "exact" | "high" | "medium" | "low"
    - type: "nickname" | "coordinates" | "street_address" | "poi" | "locality"
    - labeled_as: present when result is within 75m of an address book entry
    - Empty results array is valid (no match). No 404s.
    """
    q = request.args.get('q', '').strip()
    limit = request.args.get('limit', '10')
    try:
        limit = max(1, min(int(limit), 20))
    except (ValueError, TypeError):
        limit = 10

    # Viewport bias parameters (optional)
    lat = _safe_float(request.args.get("lat"), -90, 90)
    lon = _safe_float(request.args.get("lon"), -180, 180)
    zoom = _safe_float(request.args.get("zoom"), 0, 22)

    result = nav_tools.geocode(q, limit=limit, lat=lat, lon=lon, zoom=zoom)
    return jsonify(result)


@geocode_bp.route('/api/reverse')
def api_reverse():
    """
    Reverse geocode coordinates via Photon.

    GET /api/reverse?lat=X&lon=Y

    Returns same shape as /api/geocode:
      {query: "lat,lon", results: [{name, lat, lon, source, type, raw, ...}], count}

    Returns 200 OK with empty results on no match or when Photon cannot be
    reached. 400 on missing, non-numeric, or out-of-range coordinates.
    """
    try:
        lat = float(request.args.get('lat', ''))
        lon = float(request.args.get('lon', ''))
    except (ValueError, TypeError):
        return jsonify({'error': 'Missing or invalid lat/lon parameters'}), 400

    # NaN fails both comparisons, so it is rejected here as well.
    if not (-90 <= lat <= 90) or not (-180 <= lon <= 180):
        return jsonify({'error': 'Coordinates out of range'}), 400

    query_str = f"{lat},{lon}"

    try:
        import requests as http_requests
        # Use the Photon base URL nav_tools is configured with so both
        # reverse-geocoding paths hit the same service; the previous literal
        # stays as the fallback when nav_tools does not expose it.
        photon_url = getattr(nav_tools, 'PHOTON_URL', 'http://localhost:2322').rstrip('/')
        resp = http_requests.get(
            f"{photon_url}/reverse",
            params={"lat": lat, "lon": lon, "limit": 1},
            timeout=10,
        )
        resp.raise_for_status()
        data = resp.json()
        features = data.get("features", [])
    except Exception as exc:
        # Deliberate best effort: an unavailable Photon degrades to "no
        # match" instead of a 5xx, but the cause is kept in the log.
        logger.warning("Photon reverse geocode failed for %s: %s", query_str, exc)
        return jsonify({'query': query_str, 'results': [], 'count': 0})

    if not features:
        return jsonify({'query': query_str, 'results': [], 'count': 0})

    from .geocode import _parse_photon_features
    results = _parse_photon_features(features, source='photon_reverse')

    return jsonify({'query': query_str, 'results': results, 'count': len(results)})
"""
Human-readable category names for OSM class/type pairs.

Used by the place detail proxy to turn ("amenity", "cafe") into "Coffee shop".
Unmapped pairs fall back to a title-cased "Class: Type" label.
"""

# Labels grouped by OSM class; flattened into CATEGORY_MAP below.
_LABELS_BY_CLASS = {
    "amenity": {
        "cafe": "Coffee shop",
        "restaurant": "Restaurant",
        "fast_food": "Fast food restaurant",
        "bar": "Bar",
        "pub": "Pub",
        "biergarten": "Beer garden",
        "ice_cream": "Ice cream shop",
        "fuel": "Gas station",
        "charging_station": "EV charging station",
        "parking": "Parking",
        "bank": "Bank",
        "atm": "ATM",
        "pharmacy": "Pharmacy",
        "hospital": "Hospital",
        "clinic": "Clinic",
        "dentist": "Dentist",
        "doctors": "Doctor's office",
        "veterinary": "Veterinarian",
        "school": "School",
        "university": "University",
        "college": "College",
        "library": "Library",
        "post_office": "Post office",
        "fire_station": "Fire station",
        "police": "Police station",
        "townhall": "Town hall",
        "place_of_worship": "Place of worship",
        "theatre": "Theatre",
        "cinema": "Cinema",
        "community_centre": "Community center",
        "toilets": "Restrooms",
        "drinking_water": "Drinking water",
        "shelter": "Shelter",
        "camping": "Campground",
    },
    "shop": {
        "supermarket": "Supermarket",
        "convenience": "Convenience store",
        "hardware": "Hardware store",
        "clothes": "Clothing store",
        "car_repair": "Auto repair",
        "car": "Car dealership",
        "bakery": "Bakery",
        "butcher": "Butcher",
    },
    "leisure": {
        "park": "Park",
        "playground": "Playground",
        "sports_centre": "Sports center",
        "swimming_pool": "Swimming pool",
        "golf_course": "Golf course",
        "nature_reserve": "Nature reserve",
        "campsite": "Campsite",
    },
    "tourism": {
        "hotel": "Hotel",
        "motel": "Motel",
        "guest_house": "Guest house",
        "hostel": "Hostel",
        "camp_site": "Campsite",
        "viewpoint": "Viewpoint",
        "museum": "Museum",
        "information": "Information",
        "attraction": "Tourist attraction",
        "picnic_site": "Picnic site",
    },
    "natural": {
        "peak": "Peak",
        "spring": "Spring",
        "hot_spring": "Hot spring",
        "lake": "Lake",
        "water": "Water body",
        "cliff": "Cliff",
        "cave_entrance": "Cave",
    },
    "highway": {
        "bus_stop": "Bus stop",
        "rest_area": "Rest area",
    },
    "boundary": {
        "administrative": "Administrative boundary",
        "protected_area": "Protected area",
        "national_park": "National park",
    },
    "place": {
        "city": "City",
        "town": "Town",
        "village": "Village",
        "hamlet": "Hamlet",
        "suburb": "Suburb",
        "neighbourhood": "Neighborhood",
    },
    "building": {
        "yes": "Building",
    },
    "waterway": {
        "river": "River",
        "stream": "Stream",
        "waterfall": "Waterfall",
    },
    "landuse": {
        "cemetery": "Cemetery",
        "forest": "Forest",
    },
    "historic": {
        "monument": "Monument",
        "memorial": "Memorial",
        "ruins": "Ruins",
    },
}

# Exact (class, type) → label
CATEGORY_MAP = {
    (osm_class, osm_type): label
    for osm_class, labels in _LABELS_BY_CLASS.items()
    for osm_type, label in labels.items()
}

# Class-level wildcard fallbacks (when exact type isn't mapped)
CLASS_FALLBACKS = {
    "shop": "Shop",
    "amenity": "Amenity",
    "leisure": "Leisure",
    "tourism": "Tourism",
    "natural": "Natural feature",
    "historic": "Historic site",
}


def humanize_category(osm_class, osm_type):
    """Return a human-readable category string for an OSM class/type pair.

    Lookup order, case-insensitive on both inputs:
      1. exact (class, type) entry in CATEGORY_MAP
      2. "<class label>: <Title Cased Type>" for classes in CLASS_FALLBACKS,
         collapsed to the class label alone when both halves are identical
      3. "<Title Cased Class>: <Title Cased Type>"
    A missing class or type yields "Place".
    """
    if not (osm_class and osm_type):
        return "Place"

    class_key = osm_class.lower()
    type_key = osm_type.lower()

    exact = CATEGORY_MAP.get((class_key, type_key))
    if exact:
        return exact

    pretty_type = type_key.replace("_", " ").title()

    class_label = CLASS_FALLBACKS.get(class_key)
    if class_label:
        if class_label == pretty_type:
            return class_label
        return f"{class_label}: {pretty_type}"

    pretty_class = class_key.replace("_", " ").title()
    return f"{pretty_class}: {pretty_type}"
_pool = None
_pool_failed = False

# Map full OSM type names to single-letter codes used in Overture sources
OSM_TYPE_MAP = {
    'N': 'n', 'W': 'w', 'R': 'r',
    'node': 'n', 'way': 'w', 'relation': 'r',
    'n': 'n', 'w': 'w', 'r': 'r',
}


def _get_pool():
    """Lazy-init the connection pool. Returns None if Postgres is unreachable.

    The first failed attempt sets _pool_failed, after which no further
    connection attempts are made for the lifetime of the process.
    """
    global _pool, _pool_failed
    if _pool is not None:
        return _pool
    if _pool_failed:
        return None

    try:
        _pool = psycopg2.pool.SimpleConnectionPool(
            minconn=1,
            maxconn=3,
            host=os.environ.get('OVERTURE_DB_HOST', 'localhost'),
            port=int(os.environ.get('OVERTURE_DB_PORT', '5432')),
            dbname=os.environ.get('OVERTURE_DB_NAME', 'overture'),
            user=os.environ.get('OVERTURE_DB_USER', 'overture'),
            password=os.environ.get('OVERTURE_DB_PASSWORD', ''),
            connect_timeout=5,
        )
        logger.info("Overture PostgreSQL connection pool initialized")
        return _pool
    except Exception as e:
        _pool_failed = True
        logger.warning("Overture PostgreSQL unavailable, enrichment disabled: %s", e)
        return None


def _query(sql, params):
    """Execute a query and return the first row as a dict, or None.

    None is returned when the pool is unavailable, the query yields no row,
    or the query raises; errors are logged, never propagated.
    """
    pool = _get_pool()
    if pool is None:
        return None

    conn = None
    try:
        conn = pool.getconn()
        with conn.cursor() as cur:
            cur.execute(sql, params)
            row = cur.fetchone()
            if row is None:
                return None
            cols = [desc[0] for desc in cur.description]
            return dict(zip(cols, row))
    except Exception as e:
        logger.warning("Overture query error: %s", e)
        if conn:
            try:
                conn.rollback()
            except Exception:
                pass
        return None
    finally:
        # Always hand the connection back, even if returning it fails.
        if conn:
            try:
                pool.putconn(conn)
            except Exception:
                pass


def _format_result(row, match_method):
    """Convert a database row dict to the enrichment result shape.

    Returns None for an empty row. A `socials` value that arrives as a string
    is decoded as JSON; undecodable text becomes None.
    """
    if not row:
        return None

    socials = row.get('socials')
    if isinstance(socials, str):
        try:
            socials = json.loads(socials)
        except (json.JSONDecodeError, TypeError):
            socials = None

    return {
        'phone': row.get('phone'),
        'website': row.get('website'),
        'socials': socials,
        'brand_name': row.get('brand_name'),
        'brand_wikidata': row.get('brand_wikidata'),
        'basic_category': row.get('basic_category'),
        'confidence': row.get('confidence'),
        'gers_id': row.get('id'),
        'match_method': match_method,
    }


def find_by_osm_id(osm_type, osm_id):
    """
    Look up an Overture place by its OSM cross-reference.

    Args:
        osm_type: OSM type — 'N', 'W', 'R', 'node', 'way', 'relation', or single letter
        osm_id: OSM numeric ID (int or numeric string)

    Returns:
        Enrichment dict, or None when the type is unknown, the id is not
        numeric, or no row matches
    """
    type_letter = OSM_TYPE_MAP.get(osm_type)
    if not type_letter:
        return None

    # A malformed id is "no match", in line with the None-on-failure contract,
    # rather than an exception bubbling into the caller.
    try:
        numeric_id = int(osm_id)
    except (TypeError, ValueError):
        return None

    row = _query(
        """SELECT id, name, basic_category, confidence,
                  phone, website, socials, brand_name, brand_wikidata
           FROM places
           WHERE osm_type = %s AND osm_id = %s
           LIMIT 1""",
        (type_letter, numeric_id)
    )
    return _format_result(row, 'osm_xref')


def find_by_coords_and_name(lat, lon, name, radius_m=100):
    """
    Look up an Overture place by spatial proximity + fuzzy name match.

    Args:
        lat: Latitude
        lon: Longitude
        name: Place name to fuzzy-match
        radius_m: Search radius in meters (default 100)

    Returns:
        Enrichment dict or None
    """
    # Only a missing coordinate disqualifies the lookup: 0.0 is a valid
    # latitude (equator) and longitude (prime meridian), so test for None
    # explicitly instead of relying on truthiness.
    if not name or lat is None or lon is None:
        return None

    # ST_MakePoint takes (x, y), i.e. (lon, lat).
    row = _query(
        """SELECT id, name, basic_category, confidence,
                  phone, website, socials, brand_name, brand_wikidata,
                  similarity(name, %s) AS sim
           FROM places
           WHERE ST_DWithin(geometry::geography, ST_MakePoint(%s, %s)::geography, %s)
             AND similarity(name, %s) > 0.4
           ORDER BY sim DESC, ST_Distance(geometry::geography, ST_MakePoint(%s, %s)::geography) ASC
           LIMIT 1""",
        (name, lon, lat, radius_m, name, lon, lat)
    )
    return _format_result(row, 'coord_name_fuzzy')
_db_conn = None


# ── SQLite cache ────────────────────────────────────────────────────────

def _get_db():
    """Return the shared SQLite connection, opening it on first use.

    The database lives at <project root>/data/place_cache.db; the directory
    and the place_cache table are created when missing.
    """
    global _db_conn
    if _db_conn is None:
        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        data_dir = os.path.join(project_root, 'data')
        os.makedirs(data_dir, exist_ok=True)
        cache_file = os.path.join(data_dir, 'place_cache.db')

        _db_conn = sqlite3.connect(cache_file, check_same_thread=False)
        for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA synchronous=NORMAL"):
            _db_conn.execute(pragma)
        _db_conn.execute("""
            CREATE TABLE IF NOT EXISTS place_cache (
                osm_type TEXT NOT NULL,
                osm_id INTEGER NOT NULL,
                data TEXT NOT NULL,
                source TEXT NOT NULL,
                cached_at INTEGER NOT NULL,
                PRIMARY KEY (osm_type, osm_id)
            )
        """)
        _db_conn.commit()
        logger.info(f"Place cache DB ready at {cache_file}")
    return _db_conn


def cache_get(osm_type, osm_id):
    """Return the cached place dict with source set to 'cache', or None.

    None covers both a missing row and a row whose payload cannot be decoded.
    """
    hit = _get_db().execute(
        "SELECT data FROM place_cache WHERE osm_type=? AND osm_id=?",
        (osm_type, osm_id)
    ).fetchone()
    if not hit:
        return None
    try:
        place = json.loads(hit[0])
        place['source'] = 'cache'
    except (json.JSONDecodeError, TypeError):
        return None
    return place


def cache_put(osm_type, osm_id, data, source):
    """Insert or refresh the cached detail for one OSM element.

    Implemented as an upsert that rewrites only data, source and cached_at,
    so other columns on an existing row are left untouched.
    """
    payload = json.dumps(data)
    stamp = int(time.time())
    conn = _get_db()
    conn.execute("""
        INSERT INTO place_cache (osm_type, osm_id, data, source, cached_at)
        VALUES (?, ?, ?, ?, ?)
        ON CONFLICT(osm_type, osm_id) DO UPDATE SET
            data = excluded.data,
            source = excluded.source,
            cached_at = excluded.cached_at
    """, (osm_type, osm_id, payload, source, stamp))
    conn.commit()
# ── Overture enrichment ─────────────────────────────────────────────────

def _enrich_with_overture(result, osm_type, osm_id):
    """
    Fill missing phone / website / brand extratags from Overture Maps.

    Runs only when the has_overture_enrichment feature flag is on. Existing
    non-empty extratags are never overwritten. On a match, result['sources']
    is set to describe the primary source and the Overture match.
    Returns the (possibly enriched) result dict.
    """
    try:
        from .deployment_config import get_deployment_config
        flags = get_deployment_config().get('features', {})
        enabled = flags.get('has_overture_enrichment', False)
    except Exception:
        return result
    if not enabled:
        return result

    try:
        from .overture import find_by_osm_id, find_by_coords_and_name
    except ImportError:
        logger.debug("Overture module not available")
        return result

    # Strategy 1: OSM cross-reference (exact)
    overture = find_by_osm_id(osm_type, osm_id)
    method = 'osm_xref' if overture else None

    # Strategy 2: Coordinate + name fuzzy (fallback)
    if not overture and result.get('centroid') and result.get('name'):
        point = result['centroid']
        if point.get('lat') and point.get('lon'):
            overture = find_by_coords_and_name(
                point['lat'], point['lon'], result['name']
            )
            if overture:
                method = 'coord_name_fuzzy'

    if not overture:
        return result

    # Gap-fill only: an extratag that already has a value wins.
    tags = result.get('extratags', {})
    for osm_key, overture_key in (
        ('phone', 'phone'),
        ('website', 'website'),
        ('brand', 'brand_name'),
        ('brand:wikidata', 'brand_wikidata'),
    ):
        if tags.get(osm_key):
            continue
        candidate = overture.get(overture_key)
        if candidate:
            tags[osm_key] = candidate
    result['extratags'] = tags

    # Add source metadata
    result['sources'] = {
        'primary': result.get('source', 'unknown'),
        'enrichment': 'overture',
        'overture_match_method': method,
        'overture_gers_id': overture.get('gers_id'),
        'overture_confidence': overture.get('confidence'),
        'overture_basic_category': overture.get('basic_category'),
    }

    logger.debug(f"Overture enrichment for {osm_type}/{osm_id}: {method}")
    return result
# ── Google Places enrichment (tertiary, gap-fill only) ──────────────

# Business POI classes eligible for Google enrichment
_BUSINESS_CLASSES = {'amenity', 'shop', 'tourism', 'leisure', 'office', 'craft'}

# Fields Google can fill
_GOOGLE_GAP_FIELDS = ('opening_hours', 'phone', 'website')


def _enrich_with_google(result, osm_type, osm_id):
    """
    Tertiary enrichment via Google Places (New) API.

    Applies only when the has_google_places_enrichment flag is on, the place
    class is in _BUSINESS_CLASSES, and at least one of opening_hours, phone
    or website is still empty. Empty fields are filled; existing values are
    never overwritten. Returns the (possibly enriched) result dict.
    """
    try:
        from .deployment_config import get_deployment_config
        flags = get_deployment_config().get('features', {})
        enabled = flags.get('has_google_places_enrichment', False)
    except Exception:
        return result
    if not enabled:
        return result

    # Only enrich business-type POIs
    if result.get('class', '') not in _BUSINESS_CLASSES:
        return result

    tags = result.get('extratags', {})
    gaps = [field for field in _GOOGLE_GAP_FIELDS if not tags.get(field)]
    if not gaps:
        logger.debug(f"google_places: skip {osm_type}/{osm_id} — no gaps")
        return result

    try:
        from . import google_places
    except ImportError:
        logger.debug("google_places module not available")
        return result

    # Cached Google data is served to everyone, guests included.
    cached_pid, cached_data = google_places.cache_get_google(osm_type, osm_id)
    if cached_pid and cached_data:
        _apply_google_data(result, cached_data, gaps)
        result.setdefault('sources', {})['google_places'] = {
            'place_id': cached_pid,
            'source': 'cache',
        }
        logger.debug(f"google_places: cache hit for {osm_type}/{osm_id}")
        return result

    # A cached place id without usable data is a recorded earlier lookup
    # (a miss, or a place whose details could not be fetched): do not retry.
    if cached_pid is not None:
        return result

    # New API calls are reserved for signed-in users.
    from .auth import get_user_id
    if not get_user_id():
        logger.debug(f"google_places: skip API call for {osm_type}/{osm_id} — guest user")
        return result

    # Daily cap check
    if not google_places.check_daily_cap():
        return result

    # A text search needs a name plus a usable centroid.
    name = result.get('name', '')
    point = result.get('centroid', {})
    lat = point.get('lat')
    lon = point.get('lon')
    if not name or not lat or not lon:
        return result

    place_id = google_places.search_place(name, lat, lon)
    if not place_id:
        # Cache the miss to avoid repeated lookups
        google_places.cache_put_google(osm_type, osm_id, '__miss__', None)
        return result

    details = google_places.get_place_details(place_id)
    if not details:
        google_places.cache_put_google(osm_type, osm_id, place_id, None)
        return result

    google_places.cache_put_google(osm_type, osm_id, place_id, details)

    _apply_google_data(result, details, gaps)
    result.setdefault('sources', {})['google_places'] = {
        'place_id': place_id,
        'source': 'api',
        'daily_count': google_places.get_daily_count(),
    }

    return result
def _apply_google_data(result, google_data, gaps):
    """Copy Google Places values into result['extratags'] for listed gaps only.

    For opening hours the 'opening_hours' value is preferred; when it is
    empty, 'opening_hours_raw' is stored under its own key instead.
    """
    tags = result.get('extratags', {})

    if 'opening_hours' in gaps:
        parsed_hours = google_data.get('opening_hours')
        raw_hours = google_data.get('opening_hours_raw')
        if parsed_hours:
            tags['opening_hours'] = parsed_hours
        elif raw_hours:
            tags['opening_hours_raw'] = raw_hours

    for gap, google_key in (('phone', 'phone_number'), ('website', 'website')):
        value = google_data.get(google_key)
        if gap in gaps and value:
            tags[gap] = value

    result['extratags'] = tags


# ── Wiki link rewriting ─────────────────────────────────────────────────

# Extratag keys that may contain wiki references
_WIKI_TAGS = ('wikipedia', 'wikidata', 'wikivoyage', 'appropedia')


def _enrich_wiki_links(result):
    """
    Replace wiki-related extratags with the URLs rewrite_wiki_link returns.

    Runs only when the has_wiki_rewriting feature flag is on. A tag is
    replaced when the rewriter reports a status other than 'original'; the
    per-tag statuses are recorded under result['sources']['wiki_rewrites'].
    Returns the (possibly enriched) result dict.
    """
    try:
        from .deployment_config import get_deployment_config
        flags = get_deployment_config().get('features', {})
        enabled = flags.get('has_wiki_rewriting', False)
    except Exception:
        return result
    if not enabled:
        return result

    try:
        from .wiki_rewrite import rewrite_wiki_link
    except ImportError:
        logger.debug("wiki_rewrite module not available")
        return result

    tags = result.get('extratags', {})
    if not tags:
        return result

    statuses = {}
    for tag in _WIKI_TAGS:
        current = tags.get(tag)
        if not current:
            continue
        url, status = rewrite_wiki_link(tag, current)
        if status == 'original':
            continue
        tags[tag] = url
        statuses[tag] = status

    if statuses:
        result['extratags'] = tags
        result.setdefault('sources', {})['wiki_rewrites'] = statuses
        logger.debug(f"Wiki rewrites for {result.get('osm_type')}/{result.get('osm_id')}: {statuses}")

    return result
# ── Nominatim parsing ───────────────────────────────────────────────────

# Nominatim address array uses rank_address to indicate what each entry is.
# This table maps rank values to our flat address fields.
# NOTE(review): _parse_nominatim_address does not read this table; it matches
# on type/class first and uses literal ranks. Kept for any external importer.
RANK_TO_FIELD = {
    4: 'country',
    5: 'postcode',
    6: 'state',  # rank 6 = county in US, but we try name matching
    8: 'state',
    12: 'county',
    16: 'city',
    20: 'neighbourhood',
    22: 'neighbourhood',
    26: 'road',
    28: 'house_number',
}


def _parse_nominatim_address(address_array, country_code=None):
    """Parse Nominatim's ranked address array into a flat address dict.

    Args:
        address_array: list of address entries from a /details response;
            entries without a truthy 'isaddress' are ignored.
        country_code: value copied verbatim into 'country_code'.

    Returns:
        dict with house_number, road, neighbourhood, city, state, postcode,
        country and country_code (None where unknown). County is tracked
        while parsing but never part of the output, including for an empty
        or missing address array.
    """
    addr = {
        'house_number': None,
        'road': None,
        'neighbourhood': None,
        'city': None,
        'state': None,
        'postcode': None,
        'country': None,
        'country_code': country_code,
    }

    if not address_array:
        return addr

    # County only matters for claiming its entries so they are not
    # mis-assigned by a later branch; keeping it outside `addr` gives every
    # return path the same keys.
    county = None

    for entry in address_array:
        if not entry.get('isaddress', False):
            continue

        name = entry.get('localname', '')
        rank = entry.get('rank_address', 0)
        etype = entry.get('type', '')
        eclass = entry.get('class', '')
        is_admin_boundary = eclass == 'boundary' and etype == 'administrative'

        # Explicit type-based assignments (more reliable than rank alone).
        # Branch order matters: the first matching branch claims the entry
        # even when its field is already filled.
        if etype == 'country' and eclass == 'place':
            addr['country'] = name
        elif etype == 'state' or (is_admin_boundary and rank == 8):
            if not addr['state']:
                addr['state'] = name
        elif etype == 'county' or (is_admin_boundary and rank in (10, 12)):
            if not county:
                county = name
        elif etype in ('city', 'town', 'village', 'hamlet') and eclass == 'place':
            if not addr['city']:
                addr['city'] = name
        elif is_admin_boundary and rank == 16:
            # City-level admin boundary (common in US)
            if not addr['city']:
                addr['city'] = name
        elif etype == 'postcode':
            addr['postcode'] = name
        elif eclass == 'highway' or rank == 26:
            if not addr['road']:
                addr['road'] = name
        elif etype == 'house_number' or rank == 28:
            addr['house_number'] = name
        elif rank in (20, 22) and not addr['neighbourhood']:
            addr['neighbourhood'] = name

    return addr
def _as_dict(value):
    """Return value when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _parse_nominatim(data):
    """Parse a Nominatim /details response into our canonical shape.

    Fields that should be JSON objects (centroid, names, extratags) are
    treated as empty when they arrive as null or any other non-object, so a
    sparse record still parses instead of raising.

    Returns:
        dict with osm_type, osm_id, name, category, class, type, address,
        centroid, extratags, names, source ('nominatim_local') and boundary.
    """
    osm_type = data.get('osm_type', '')
    osm_id = data.get('osm_id', 0)
    osm_class = data.get('category', '')
    osm_type_tag = data.get('type', '')

    # Centroid: GeoJSON order is [lon, lat]; (0, 0) stands for "unknown".
    coords = _as_dict(data.get('centroid')).get('coordinates') or [0, 0]
    centroid = {'lat': coords[1], 'lon': coords[0]} if len(coords) >= 2 else {'lat': 0, 'lon': 0}

    # Names
    names = _as_dict(data.get('names'))
    display_name = data.get('localname', '') or names.get('name', '')

    # Address
    address = _parse_nominatim_address(
        data.get('address', []),
        country_code=data.get('country_code')
    )

    # Use calculated_postcode if address parse didn't find one
    if not address.get('postcode') and data.get('calculated_postcode'):
        address['postcode'] = data['calculated_postcode']

    # Extratags
    raw_extra = _as_dict(data.get('extratags'))
    extratags = {
        'opening_hours': raw_extra.get('opening_hours'),
        'phone': raw_extra.get('phone') or raw_extra.get('contact:phone'),
        'website': raw_extra.get('website') or raw_extra.get('contact:website') or raw_extra.get('url'),
        'email': raw_extra.get('email') or raw_extra.get('contact:email'),
        'wikipedia': raw_extra.get('wikipedia'),
        'wikidata': raw_extra.get('wikidata'),
        'cuisine': raw_extra.get('cuisine'),
        'operator': raw_extra.get('operator'),
        'wheelchair': raw_extra.get('wheelchair'),
        'fee': raw_extra.get('fee'),
        'takeaway': raw_extra.get('takeaway'),
    }

    # Category: use extratags.place for boundaries (e.g. "city"), else class/type
    effective_class = osm_class
    effective_type = osm_type_tag
    if osm_class == 'boundary' and osm_type_tag == 'administrative':
        place_tag = raw_extra.get('place') or raw_extra.get('linked_place')
        if place_tag:
            effective_class = 'place'
            effective_type = place_tag

    category = humanize_category(effective_class, effective_type)

    # Filter names: only include extra name tags, not the bare "name"
    extra_names = {k: v for k, v in names.items() if k != 'name'}

    # Boundary geometry (polygon/multipolygon from Nominatim)
    boundary = None
    geom = data.get('geometry')
    if isinstance(geom, dict) and geom.get('type') in ('Polygon', 'MultiPolygon'):
        boundary = geom

    return {
        'osm_type': osm_type,
        'osm_id': osm_id,
        'name': display_name,
        'category': category,
        'class': osm_class,
        'type': osm_type_tag,
        'address': address,
        'centroid': centroid,
        'extratags': extratags,
        'names': extra_names if extra_names else None,
        'source': 'nominatim_local',
        'boundary': boundary,
    }


# ── Overpass parsing ────────────────────────────────────────────────────

OVERPASS_TYPE_MAP = {'N': 'node', 'W': 'way', 'R': 'relation'}


def _build_overpass_query(osm_type, osm_id):
    """Build an Overpass QL query for a single element.

    Returns None when osm_type is not one of the keys of OVERPASS_TYPE_MAP.
    """
    elem = OVERPASS_TYPE_MAP.get(osm_type)
    if not elem:
        return None
    return f"[out:json][timeout:10];{elem}({osm_id});out tags center;"
def get_place_detail(osm_type, osm_id):
    """
    Fetch place details for an OSM element.

    Sources are tried in order: the place cache, the local Nominatim
    instance, then Overpass. A hit from either live source is enriched
    (Overture, Google, wiki links), written to the cache and returned.

    Args:
        osm_type: OSM element type letter (case-insensitive); must be in
            VALID_OSM_TYPES after upper-casing.
        osm_id: OSM element ID; must be a positive integer.

    Returns (dict, status_code):
      - (data, 200) on success
      - (error_dict, 400) if osm_type or osm_id is invalid
      - (error_dict, 404) if not found in any source
      - (error_dict, 502) if both sources error
    """
    osm_type = osm_type.upper()
    if osm_type not in VALID_OSM_TYPES:
        return {'error': f'Invalid osm_type: {osm_type}. Must be N, W, or R.'}, 400

    if osm_id <= 0:
        return {'error': 'osm_id must be a positive integer'}, 400

    def _finish(result, source):
        # Shared post-processing for a hit from either live source.
        result = _enrich_with_overture(result, osm_type, osm_id)
        result = _enrich_with_google(result, osm_type, osm_id)
        result = _enrich_wiki_links(result)
        cache_put(osm_type, osm_id, result, source)
        return result, 200

    # 1. Check cache
    cached = cache_get(osm_type, osm_id)
    if cached:
        logger.debug(f"Cache hit: {osm_type}/{osm_id}")
        return cached, 200

    # 2. Try local Nominatim first
    nominatim_result = None
    nominatim_error = None
    try:
        resp = http_requests.get(NOMINATIM_URL, params={
            'osmtype': osm_type,
            'osmid': osm_id,
            'format': 'json',
            'addressdetails': 1,
            'hierarchy': 0,
            'keywords': 0,
            'polygon_geojson': 1,
        }, timeout=5)

        if resp.status_code == 200:
            data = resp.json()
            # Nominatim returns a result even for IDs not in its DB,
            # but they'll have empty/minimal data. Check for osm_id match.
            if data.get('osm_id') == osm_id:
                nominatim_result = _parse_nominatim(data)
                logger.debug(f"Nominatim hit: {osm_type}/{osm_id}")
        elif resp.status_code != 404:
            # NOTE(review): 404 is presumably Nominatim's "no such object"
            # answer and is treated as a clean miss; confirm against the
            # deployed version. Any other non-200 status is a source
            # failure and must count towards the 502 decision below.
            nominatim_error = f"Nominatim HTTP {resp.status_code}"
            logger.warning(f"Nominatim HTTP {resp.status_code} for {osm_type}/{osm_id}")
    except Exception as e:
        nominatim_error = str(e)
        logger.warning(f"Nominatim error for {osm_type}/{osm_id}: {e}")

    if nominatim_result:
        return _finish(nominatim_result, 'nominatim_local')

    # 3. Fallback to Overpass
    overpass_result = None
    overpass_error = None
    try:
        query = _build_overpass_query(osm_type, osm_id)
        if query:
            resp = http_requests.post(
                OVERPASS_URL,
                data={'data': query},
                headers={'User-Agent': OVERPASS_UA},
                timeout=10,
            )
            if resp.status_code == 200:
                data = resp.json()
                overpass_result = _parse_overpass(data, osm_type, osm_id)
                if overpass_result:
                    logger.debug(f"Overpass hit: {osm_type}/{osm_id}")
            elif resp.status_code == 429:
                overpass_error = "Overpass rate limited"
                logger.warning(f"Overpass 429 for {osm_type}/{osm_id}")
            else:
                overpass_error = f"Overpass HTTP {resp.status_code}"
    except Exception as e:
        overpass_error = str(e)
        logger.warning(f"Overpass error for {osm_type}/{osm_id}: {e}")

    if overpass_result:
        return _finish(overpass_result, 'overpass')

    # 4. Both failed
    if nominatim_error and overpass_error:
        logger.error(f"Both sources failed for {osm_type}/{osm_id}: "
                     f"Nominatim={nominatim_error}, Overpass={overpass_error}")
        return {'error': 'Both data sources unavailable'}, 502

    # Not found in either source (at most one source errored)
    return {'error': f'{osm_type}/{osm_id} not found'}, 404
+ + Returns (dict, status_code): + - (data, 200) on success + - (error_dict, 404) if entity not found + - (error_dict, 400) if invalid ID format + - (error_dict, 502) on API error + """ + # Validate wikidata ID format (Q followed by digits) + wikidata_id = wikidata_id.upper().strip() + if not wikidata_id.startswith("Q") or not wikidata_id[1:].isdigit(): + return {"error": f"Invalid wikidata ID: {wikidata_id}. Must be Q followed by digits."}, 400 + + try: + resp = http_requests.get(WIKIDATA_API_URL, params={ + "action": "wbgetentities", + "ids": wikidata_id, + "format": "json", + "languages": "en", + "props": "labels|descriptions|claims|sitelinks", + }, timeout=10, headers={"User-Agent": "Navi/1.0 (forge.echo6.co/matt/recon)"}) + + if resp.status_code != 200: + logger.warning(f"Wikidata API error for {wikidata_id}: HTTP {resp.status_code}") + return {"error": "Wikidata API error"}, 502 + + data = resp.json() + entities = data.get("entities", {}) + entity = entities.get(wikidata_id) + + if not entity or entity.get("missing"): + return {"error": f"Wikidata entity {wikidata_id} not found"}, 404 + + # Extract basic info + labels = entity.get("labels", {}) + descriptions = entity.get("descriptions", {}) + claims = entity.get("claims", {}) + + name = labels.get("en", {}).get("value", wikidata_id) + description = descriptions.get("en", {}).get("value", "") + + # Extract coordinates from P625 (coordinate location) + lat, lon = None, None + if "P625" in claims: + coord_claim = claims["P625"] + if coord_claim and coord_claim[0].get("mainsnak", {}).get("datavalue"): + coord_val = coord_claim[0]["mainsnak"]["datavalue"]["value"] + lat = coord_val.get("latitude") + lon = coord_val.get("longitude") + + # Extract population from P1082 + population = None + if "P1082" in claims: + pop_claims = claims["P1082"] + if pop_claims: + # Get the most recent population value + for claim in pop_claims: + if claim.get("mainsnak", {}).get("datavalue"): + try: + population = 
int(claim["mainsnak"]["datavalue"]["value"]["amount"].lstrip("+")) + break + except (KeyError, ValueError): + pass + + # Extract country from P17 + country = None + if "P17" in claims: + country_claims = claims["P17"] + if country_claims and country_claims[0].get("mainsnak", {}).get("datavalue"): + country_id = country_claims[0]["mainsnak"]["datavalue"]["value"]["id"] + # Could resolve this to a name, but for now just store the ID + + # Extract instance of (P31) for type classification + instance_of = [] + if "P31" in claims: + for claim in claims["P31"]: + if claim.get("mainsnak", {}).get("datavalue"): + instance_of.append(claim["mainsnak"]["datavalue"]["value"]["id"]) + + # Extract OSM relation ID if available (P402) + osm_relation_id = None + if "P402" in claims: + osm_claims = claims["P402"] + if osm_claims and osm_claims[0].get("mainsnak", {}).get("datavalue"): + osm_relation_id = osm_claims[0]["mainsnak"]["datavalue"]["value"] + + # Extract Wikipedia sitelink + sitelinks = entity.get("sitelinks", {}) + wikipedia = None + if "enwiki" in sitelinks: + wiki_title = sitelinks["enwiki"].get("title", "") + if wiki_title: + wikipedia = f"en:{wiki_title}" + + result = { + "wikidata_id": wikidata_id, + "name": name, + "description": description, + "centroid": {"lat": lat, "lon": lon} if lat and lon else None, + "population": population, + "instance_of": instance_of, + "osm_relation_id": osm_relation_id, + "source": "wikidata", + "extratags": { + "wikidata": wikidata_id, + }, + } + + if wikipedia: + result["extratags"]["wikipedia"] = wikipedia + + # Fetch boundary polygon from Nominatim if we have an OSM relation ID + boundary = None + if osm_relation_id: + try: + nom_resp = http_requests.get(NOMINATIM_URL, params={ + 'osmtype': 'R', + 'osmid': osm_relation_id, + 'format': 'json', + 'polygon_geojson': 1, + }, timeout=5) + if nom_resp.status_code == 200: + nom_data = nom_resp.json() + geom = nom_data.get('geometry') + if geom and geom.get('type') in ('Polygon', 
'MultiPolygon'): + boundary = geom + logger.debug(f"Wikidata boundary hit for {wikidata_id}") + except Exception as e: + logger.debug(f"Wikidata boundary fetch failed: {e}") + + result["boundary"] = boundary + + logger.debug(f"Wikidata hit: {wikidata_id} -> {name}") + return result, 200 + + except Exception as e: + logger.warning(f"Wikidata error for {wikidata_id}: {e}") + return {"error": "Wikidata lookup failed"}, 502 diff --git a/lib/wiki_rewrite.py b/lib/wiki_rewrite.py new file mode 100644 index 0000000..d884635 --- /dev/null +++ b/lib/wiki_rewrite.py @@ -0,0 +1,324 @@ +""" +Wiki link rewriter — rewrites OSM wikipedia/wikidata/wikivoyage/appropedia +links to local Kiwix URLs where the article exists in a loaded ZIM. + +Falls back silently to public URLs when article is unavailable locally. +Caches positive results only in place_cache.db. + +Kiwix catalog is parsed from the OPDS Atom feed at startup and refreshed +hourly to pick up newly loaded ZIMs without a restart. + +Operations note: + - After loading a new ZIM, either restart RECON (forces fresh catalog + fetch) or wait up to 1 hour for automatic refresh. + - To invalidate the wiki cache (e.g. after ZIM update): + sqlite3 /opt/recon/data/place_cache.db "DELETE FROM wiki_cache;" +""" +import os +import re +import sqlite3 +import time +import xml.etree.ElementTree as ET +from urllib.parse import unquote, quote + +import requests as http_requests + +from .utils import setup_logging + +logger = setup_logging('recon.wiki_rewrite') + +# ── Configuration ─────────────────────────────────────────────────────── + +KIWIX_BASE = "http://localhost:8430" +KIWIX_PUBLIC_BASE = "https://wiki.echo6.co" +KIWIX_CATALOG_URL = f"{KIWIX_BASE}/catalog/v2/entries" +HEAD_TIMEOUT = 1.5 # seconds +CATALOG_REFRESH_INTERVAL = 3600 # 1 hour + +# OPDS Atom namespace +_ATOM_NS = "http://www.w3.org/2005/Atom" + +# ── ZIM catalog map ───────────────────────────────────────────────────── + +_zim_map = {} # source_type → content_path e.g. 
# Minimum delay between catalog fetch attempts. Without it an empty or
# unreachable catalog would be re-fetched on every single wiki lookup,
# because an empty map is indistinguishable from "never loaded".
_ZIM_RETRY_INTERVAL = 60  # seconds
_zim_last_attempt = 0.0  # time.time() of the most recent fetch attempt


def _discover_zims():
    """Parse Kiwix OPDS Atom catalog to map source types to content paths.

    On success replaces the module-level ``_zim_map`` and updates
    ``_zim_map_ts``. On any failure (non-200 response, network error,
    malformed XML) the previous map is left untouched and a warning is
    logged. Every call records its start time in ``_zim_last_attempt`` so
    that ``_ensure_zim_map`` can throttle retries.
    """
    global _zim_map, _zim_map_ts, _zim_last_attempt

    _zim_last_attempt = time.time()

    try:
        resp = http_requests.get(KIWIX_CATALOG_URL, timeout=5)
        if resp.status_code != 200:
            logger.warning(f"Kiwix catalog returned HTTP {resp.status_code}")
            return

        root = ET.fromstring(resp.content)
        new_map = {}

        for entry in root.findall(f"{{{_ATOM_NS}}}entry"):
            name_el = entry.find(f"{{{_ATOM_NS}}}name")
            if name_el is None:
                continue
            book_name = name_el.text or ""

            # The text/html link's href has the form /content/<path>;
            # keep only the <path> part.
            content_path = None
            for link in entry.findall(f"{{{_ATOM_NS}}}link"):
                if link.get("type") == "text/html":
                    href = link.get("href", "")
                    if href.startswith("/content/"):
                        content_path = href[len("/content/"):]
                        break

            if not content_path:
                continue

            # Match book name against known prefixes
            for prefix, source_type in _ZIM_PREFIX_MAP:
                if book_name.startswith(prefix):
                    new_map[source_type] = content_path
                    break

        _zim_map = new_map
        _zim_map_ts = time.time()
        logger.info(f"ZIM catalog refreshed: {new_map}")

    except Exception as e:
        logger.warning(f"Failed to discover ZIMs from Kiwix catalog: {e}")


def _ensure_zim_map():
    """Lazy-load and refresh ZIM map if stale.

    A fetch is attempted when the map is empty or older than
    CATALOG_REFRESH_INTERVAL, but never more often than once per
    _ZIM_RETRY_INTERVAL seconds.
    """
    now = time.time()
    if (now - _zim_last_attempt) < _ZIM_RETRY_INTERVAL:
        return
    if not _zim_map or (now - _zim_map_ts) > CATALOG_REFRESH_INTERVAL:
        _discover_zims()
# Patterns for OSM wikipedia/wikidata tag values
_WIKI_TAG_RE = re.compile(r'^(?:en:)?(.+)$')  # "en:Title" or just "Title"
_WIKI_URL_RE = re.compile(r'https?://en\.wikipedia\.org/wiki/(.+)')
_WIKIDATA_TAG_RE = re.compile(r'^(Q\d+)$')
_WIKIDATA_URL_RE = re.compile(r'https?://(?:www\.)?wikidata\.org/wiki/(Q\d+)')
_WIKIVOYAGE_URL_RE = re.compile(r'https?://en\.wikivoyage\.org/wiki/(.+)')
_APPROPEDIA_URL_RE = re.compile(r'https?://(?:www\.)?appropedia\.org/(?:wiki/)?(.+)')


def _normalize_article_id(article_id):
    """Normalize article ID to MediaWiki/Kiwix convention: spaces → underscores."""
    return article_id.replace(' ', '_')


def classify_wiki_link(tag_name, value):
    """
    Classify an OSM extratag value into (source_type, article_id) or None.

    tag_name: the extratags key ('wikipedia', 'wikidata', etc.)
    value:    the raw tag value from OSM

    Article IDs are normalized to MediaWiki convention (spaces → underscores).
    URL-form values are percent-decoded; tag-form values are used as written.

    Returns None when *value* is empty or not a string, when *tag_name* is
    not a known wiki tag, or when *value* is a URL that does not match the
    expected site for that tag (e.g. a de.wikipedia.org link). Such a URL is
    not a title, and treating it as one would produce a bogus article id.
    """
    if not value or not isinstance(value, str):
        return None

    value = value.strip()
    # A value with a scheme separator that failed the site-specific URL
    # pattern is a link to some other host, never a bare article title.
    is_url = '://' in value

    if tag_name == 'wikidata':
        m = _WIKIDATA_TAG_RE.match(value) or _WIKIDATA_URL_RE.match(value)
        if m:
            return ('wikidata', m.group(1))
        return None

    if tag_name == 'wikipedia':
        # URL form: https://en.wikipedia.org/wiki/Title
        m = _WIKI_URL_RE.match(value)
        if m:
            return ('wikipedia', _normalize_article_id(unquote(m.group(1))))
        if is_url:
            return None
        # Tag form: "en:Title" or "Title"
        m = _WIKI_TAG_RE.match(value)
        if m:
            return ('wikipedia', _normalize_article_id(m.group(1)))
        return None

    if tag_name == 'wikivoyage':
        m = _WIKIVOYAGE_URL_RE.match(value)
        if m:
            return ('wikivoyage', _normalize_article_id(unquote(m.group(1))))
        if is_url:
            return None
        # Plain tag: "en:Title" or "Title"
        m = _WIKI_TAG_RE.match(value)
        if m:
            return ('wikivoyage', _normalize_article_id(m.group(1)))
        return None

    if tag_name == 'appropedia':
        m = _APPROPEDIA_URL_RE.match(value)
        if m:
            return ('appropedia', _normalize_article_id(unquote(m.group(1))))
        if is_url:
            return None
        return ('appropedia', _normalize_article_id(value))

    return None
def check_kiwix_has_article(source_type, article_id):
    """
    Check if an article exists in local Kiwix.

    The wiki_cache table is consulted first; on a miss a HEAD request is
    sent to the local Kiwix server for the article's content URL.

    Returns (bool, url):
      - (True, kiwix_public_url) if article exists locally
      - (False, None) if not found or Kiwix unavailable

    Only positive results are cached.
    """
    # Check cache first
    db = _get_db()
    row = db.execute(
        "SELECT kiwix_url FROM wiki_cache WHERE source_type=? AND article_id=?",
        (source_type, article_id)
    ).fetchone()
    if row:
        return (True, row[0])

    # Build local HEAD URL
    _ensure_zim_map()
    content_path = _zim_map.get(source_type)
    if not content_path:
        return (False, None)

    # Quote outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError before Python 3.12.
    safe_chars = "/:@!$&'()*+,;="
    quoted_id = quote(article_id, safe=safe_chars)
    head_url = f"{KIWIX_BASE}/content/{content_path}/{quoted_id}"

    try:
        resp = http_requests.head(head_url, timeout=HEAD_TIMEOUT, allow_redirects=True)
        if resp.status_code != 200:
            return (False, None)

        kiwix_url = build_kiwix_url(source_type, article_id)
        if not kiwix_url:
            # The ZIM map no longer lists this source type; nothing to cache
            # (kiwix_url is NOT NULL in wiki_cache).
            return (False, None)

        # Cache positive result
        now = int(time.time())
        db.execute("""
            INSERT OR REPLACE INTO wiki_cache (source_type, article_id, kiwix_url, cached_at)
            VALUES (?, ?, ?, ?)
        """, (source_type, article_id, kiwix_url, now))
        db.commit()
        return (True, kiwix_url)
    except Exception as e:
        # Best effort: any HTTP or cache-write failure means "not available locally".
        logger.debug(f"Kiwix HEAD failed for {source_type}/{article_id}: {e}")
        return (False, None)
+ +Downloads Overture Places Parquet from S3 via DuckDB (public bucket, no credentials), +filters to North America bounding box, and inserts into local PostgreSQL with PostGIS. + +Usage: + cd /opt/recon && venv/bin/python scripts/overture_import.py + +Re-runnable (idempotent via UPSERT). +""" + +import json +import logging +import os +import re +import sys +import time + +import duckdb +import psycopg2 +import psycopg2.extras + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s %(levelname)s %(message)s', + datefmt='%H:%M:%S' +) +log = logging.getLogger('overture_import') + +# --- Config --- +OVERTURE_RELEASE = '2026-04-15.0' +S3_PATH = f's3://overturemaps-us-west-2/release/{OVERTURE_RELEASE}/theme=places/type=place/*' + +# North America bounding box (generous — includes Hawaii, Puerto Rico, Canada) +BBOX = { + 'xmin': -170.0, + 'xmax': -50.0, + 'ymin': 15.0, + 'ymax': 85.0, +} + +BATCH_SIZE = 50_000 +OSM_RECORD_RE = re.compile(r'^([nwr])(\d+)@\d+$') + +DB_CONFIG = { + 'host': os.environ.get('OVERTURE_DB_HOST', 'localhost'), + 'port': int(os.environ.get('OVERTURE_DB_PORT', '5432')), + 'dbname': os.environ.get('OVERTURE_DB_NAME', 'overture'), + 'user': os.environ.get('OVERTURE_DB_USER', 'overture'), + 'password': os.environ.get('OVERTURE_DB_PASSWORD', ''), +} + + +def create_table(conn): + """Create places table and indexes if they don't exist.""" + with conn.cursor() as cur: + cur.execute(""" + CREATE TABLE IF NOT EXISTS places ( + id TEXT PRIMARY KEY, + geometry GEOMETRY(Point, 4326), + name TEXT, + basic_category TEXT, + confidence REAL, + phone TEXT, + website TEXT, + socials JSONB, + brand_name TEXT, + brand_wikidata TEXT, + osm_type CHAR(1), + osm_id BIGINT, + source_record_id TEXT, + raw_sources JSONB + ); + """) + cur.execute(""" + CREATE INDEX IF NOT EXISTS idx_places_osm + ON places(osm_type, osm_id) WHERE osm_type IS NOT NULL; + """) + cur.execute(""" + CREATE INDEX IF NOT EXISTS idx_places_geom + ON places USING GIST(geometry); + """) + 
def parse_osm_ref(sources):
    """Find the first OSM cross-reference in an Overture sources array.

    Each source's ``record_id`` is matched against OSM_RECORD_RE. For the
    first source that matches, returns ``(type_letter, osm_id, record_id)``
    with the ID converted to int and the record id as a string. Returns
    ``(None, None, None)`` when *sources* is empty or nothing matches.
    """
    no_match = (None, None, None)
    if not sources:
        return no_match

    def _record_id_of(source):
        # Plain dicts use .get(); anything else subscriptable is tried by key.
        if isinstance(source, dict):
            return source.get('record_id', '')
        if hasattr(source, '__getitem__'):
            try:
                return source['record_id']
            except (KeyError, TypeError, IndexError):
                return None
        return None

    for source in sources:
        raw_id = _record_id_of(source)
        if not raw_id:
            continue
        text_id = str(raw_id)
        found = OSM_RECORD_RE.match(text_id)
        if found:
            return found.group(1), int(found.group(2)), text_id

    return no_match
""" + INSERT INTO places (id, geometry, name, basic_category, confidence, + phone, website, socials, brand_name, brand_wikidata, + osm_type, osm_id, source_record_id, raw_sources) + VALUES %s + ON CONFLICT (id) DO UPDATE SET + geometry = EXCLUDED.geometry, + name = EXCLUDED.name, + basic_category = EXCLUDED.basic_category, + confidence = EXCLUDED.confidence, + phone = EXCLUDED.phone, + website = EXCLUDED.website, + socials = EXCLUDED.socials, + brand_name = EXCLUDED.brand_name, + brand_wikidata = EXCLUDED.brand_wikidata, + osm_type = EXCLUDED.osm_type, + osm_id = EXCLUDED.osm_id, + source_record_id = EXCLUDED.source_record_id, + raw_sources = EXCLUDED.raw_sources + """ + + template = """( + %(id)s, + ST_SetSRID(ST_MakePoint(%(lon)s, %(lat)s), 4326), + %(name)s, + %(basic_category)s, + %(confidence)s, + %(phone)s, + %(website)s, + %(socials)s::jsonb, + %(brand_name)s, + %(brand_wikidata)s, + %(osm_type)s, + %(osm_id)s, + %(source_record_id)s, + %(raw_sources)s::jsonb + )""" + + total = 0 + osm_refs = 0 + batch = [] + + log.info('DuckDB query executing, fetching results in chunks...') + + # Fetch in chunks using fetchmany on the relation + chunk_size = BATCH_SIZE + while True: + chunk = result_rel.fetchmany(chunk_size) + if not chunk: + break + + for row in chunk: + row_id = row[0] + lon = row[1] + lat = row[2] + name = row[3] + basic_cat = row[4] + conf = row[5] + phones = row[6] + websites = row[7] + socials_raw = row[8] + brand_raw = row[9] + sources_raw = row[10] + + if lon is None or lat is None: + continue + + # Phone: first element of VARCHAR[] + phone = None + if phones and len(phones) > 0: + phone = str(phones[0]) if phones[0] else None + + # Website: first element of VARCHAR[] + website = None + if websites and len(websites) > 0: + website = str(websites[0]) if websites[0] else None + + # Socials: VARCHAR[] → JSON array of strings + socials_json = None + if socials_raw and len(socials_raw) > 0: + socials_json = json.dumps([str(s) for s in socials_raw if s]) 
+ + # Brand: struct with wikidata and names.primary + brand_name = None + brand_wikidata = None + if brand_raw: + try: + if isinstance(brand_raw, dict): + brand_wikidata = brand_raw.get('wikidata') + names_struct = brand_raw.get('names') + if names_struct and isinstance(names_struct, dict): + brand_name = names_struct.get('primary') + else: + # DuckDB struct — access by key + brand_wikidata = brand_raw['wikidata'] if 'wikidata' in dir(brand_raw) else None + try: + brand_wikidata = brand_raw[0] # wikidata is first field + names_struct = brand_raw[1] # names is second field + if names_struct: + brand_name = names_struct[0] # primary is first field + except (IndexError, TypeError): + pass + except Exception: + pass + + # Sources: parse OSM cross-reference + sources_list = None + if sources_raw: + if isinstance(sources_raw, (list, tuple)): + sources_list = [] + for s in sources_raw: + if isinstance(s, dict): + sources_list.append(s) + else: + # DuckDB struct tuple — convert + try: + sources_list.append({ + 'dataset': s[1] if len(s) > 1 else None, + 'record_id': s[3] if len(s) > 3 else None, + }) + except (TypeError, IndexError): + pass + + osm_type_letter, osm_id_val, source_record_id = parse_osm_ref(sources_list) + if osm_type_letter: + osm_refs += 1 + + raw_sources_json = json.dumps(sources_list) if sources_list else None + + batch.append({ + 'id': row_id, + 'lon': float(lon), + 'lat': float(lat), + 'name': name, + 'basic_category': basic_cat, + 'confidence': float(conf) if conf is not None else None, + 'phone': phone, + 'website': website, + 'socials': socials_json, + 'brand_name': brand_name, + 'brand_wikidata': brand_wikidata, + 'osm_type': osm_type_letter, + 'osm_id': osm_id_val, + 'source_record_id': source_record_id, + 'raw_sources': raw_sources_json, + }) + + if len(batch) >= BATCH_SIZE: + with conn.cursor() as cur: + psycopg2.extras.execute_values( + cur, upsert_sql, batch, + template=template, + page_size=BATCH_SIZE + ) + conn.commit() + total += len(batch) 
+ elapsed = time.time() - t_start + rate = total / elapsed if elapsed > 0 else 0 + log.info(f'Inserted {total:,} rows ({osm_refs:,} OSM xrefs) ' + f'[{rate:.0f} rows/sec, {elapsed:.0f}s elapsed]') + batch = [] + + # Flush remaining + if batch: + with conn.cursor() as cur: + psycopg2.extras.execute_values( + cur, upsert_sql, batch, + template=template, + page_size=BATCH_SIZE + ) + conn.commit() + total += len(batch) + + duck.close() + + # Final stats + elapsed = time.time() - t_start + log.info(f'Import complete: {total:,} rows, {osm_refs:,} OSM cross-refs, ' + f'{elapsed:.0f}s total ({total/elapsed:.0f} rows/sec)') + + # Verify + with conn.cursor() as cur: + cur.execute("SELECT count(*) FROM places") + count = cur.fetchone()[0] + cur.execute("SELECT count(*) FROM places WHERE osm_type IS NOT NULL") + osm_count = cur.fetchone()[0] + log.info(f'Final table: {count:,} total rows, {osm_count:,} with OSM cross-references') + + conn.close() + + +if __name__ == '__main__': + run_import() diff --git a/templates/base.html b/templates/base.html index 49b1a21..4c06892 100644 --- a/templates/base.html +++ b/templates/base.html @@ -21,6 +21,7 @@ PeerTube Kiwix Search + Nav-I Settings {% if subnav %} diff --git a/templates/knowledge/deleted_contacts.html b/templates/knowledge/deleted_contacts.html new file mode 100644 index 0000000..58a9ff5 --- /dev/null +++ b/templates/knowledge/deleted_contacts.html @@ -0,0 +1,56 @@ +{% extends "base.html" %} +{% block content %} +

Deleted Contacts

+{% if not contacts %} +

No deleted contacts.

+{% else %} + + + {% for c in contacts %} + + + + + + + + + {% endfor %} +
LabelNameCategoryPhoneDeleted AtActions
{{ c.label }}{{ c.name or '' }}{{ c.category or '' }}{{ c.phone or '' }}{{ c.deleted_at or '' }} + + +
+{% endif %} +{% endblock %} +{% block scripts %} + +{% endblock %} diff --git a/templates/navi/api_keys.html b/templates/navi/api_keys.html new file mode 100644 index 0000000..abf2d16 --- /dev/null +++ b/templates/navi/api_keys.html @@ -0,0 +1,269 @@ +{% extends "base.html" %} +{% block content %} +

API Keys

+ +
+

Updating keys does not restart RECON. After updates, click Restart RECON below or restart manually from terminal.

+
+ +
Loading keys...
+ + + + + + + + + + + +
+ + +
+ + + +{% endblock %} + +{% block scripts %} + +{% endblock %} diff --git a/templates/navi/deleted_contacts.html b/templates/navi/deleted_contacts.html new file mode 100644 index 0000000..0847fab --- /dev/null +++ b/templates/navi/deleted_contacts.html @@ -0,0 +1,116 @@ +{% extends "base.html" %} +{% block content %} +

Deleted Contacts

+{% if not contacts %} +

No deleted contacts.

+{% else %} + + + {% for c in contacts %} + + + + + + + + + {% endfor %} +
LabelNameCategoryPhoneDeleted AtActions
{{ c.label }}{{ c.name or '' }}{{ c.category or '' }}{{ c.phone or '' }}{{ c.deleted_at or '' }} + + +
+{% endif %} + + + +{% endblock %} +{% block scripts %} + +{% endblock %} diff --git a/templates/navi/landing.html b/templates/navi/landing.html new file mode 100644 index 0000000..131f3af --- /dev/null +++ b/templates/navi/landing.html @@ -0,0 +1,22 @@ +{% extends "base.html" %} +{% block content %} +

Nav-I

+

Navi frontend management — contacts, API keys, and configuration.

+ + +{% endblock %} diff --git a/tools/recon_rag_tool.py b/tools/recon_rag_tool.py new file mode 100644 index 0000000..c835864 --- /dev/null +++ b/tools/recon_rag_tool.py @@ -0,0 +1,1653 @@ +""" +title: RECON Knowledge Base +author: Echo6 +version: 5.0.0 +description: RAG filter with three-tier cascade: Qdrant (domain knowledge) → Kiwix (offline wiki) → SearXNG (web search). Supports intent-based metadata filtering, FlashRank neural reranking with MMR diversity, Ollama-powered query expansion, transcript source boosting, semantic query routing with inline navigation, and address book place resolution. +""" + +import logging +import json +import math +import re +import threading +import html +from datetime import datetime +from html.parser import HTMLParser +from pathlib import Path +from typing import Optional, Callable, Awaitable +from concurrent.futures import ThreadPoolExecutor, as_completed +from urllib.parse import quote, unquote + +import requests +from pydantic import BaseModel, Field + +log = logging.getLogger(__name__) + +# Module-level source store: keyed by chat_id so inlet/outlet share state +# even if OWI instantiates separate Filter objects per call. +_SOURCE_STORE: dict[str, list] = {} + +# ── CASCADE CONFIGURATION (v5.0.0) ─────────────────────────────────────────── +# FlashRank score threshold for Tier 1 (Qdrant). Below this, fall through to Tier 2. +# Based on calibration: RECON queries cluster at 0.95-1.0, misses below 0.3. +# 0.5 is conservative - will let more through to Kiwix than strictly necessary. 
+CASCADE_CONFIDENCE_THRESHOLD = 0.5 + +# Kiwix-serve configuration +KIWIX_BASE_URL = "http://localhost:8430" +KIWIX_SEARCH_TIMEOUT = 5 # seconds +KIWIX_ARTICLE_TIMEOUT = 5 # seconds +KIWIX_MAX_RESULTS = 3 + +# SearXNG configuration +SEARXNG_URL = "http://192.168.1.102:8080" +SEARXNG_TIMEOUT = 5 # seconds +SEARXNG_MAX_RESULTS = 5 + +# Cascade logging +CASCADE_LOG_PATH = Path("/opt/recon/logs/cascade.jsonl") + +# ── Semantic Query Router (v4.3.0) ─────────────────────────────────────────── +ROUTE_EXAMPLES = { + "nav_route": [ + "how do I get to Boise", + "directions to Twin Falls", + "how do I get from Buhl to Boise", + "drive from Jerome to Sun Valley", + "route from Boise to McCall", + "what's the fastest way to Sun Valley", + "how far is it to Twin Falls", + "take me to Shoshone", + "navigate to the airport", + "how do I drive to Salt Lake City", + "walking directions to the park", + "bike route to downtown", + ], + "nav_reverse_geocode": [ + "what town is at 42.5, -114.7", + "where am I right now", + "what is at coordinates 43.6, -116.2", + "what location is 42.574, -114.607", + "where is this place 44.0, -114.3", + "what city is near 42.7, -114.5", + "reverse geocode 43.0, -115.0", + "what's at this location 42.9, -114.8", + ], + "direct_answer": [ + "hello", + "hey aurora", + "good morning", + "thanks", + "thank you", + "what's your name", + "who are you", + "tell me a joke", + "how are you", + "hi there", + ], + "rag_search": [ + "what does the survival manual say about water", + "how to purify water in the field", + "how to treat a gunshot wound", + "what is the ranger handbook chapter on patrolling", + "field manual water purification", + "how to build a shelter in the wilderness", + "tactical combat casualty care procedures", + "what does FM 21-76 say about fire starting", + ], +} + +_ROUTE_CENTROIDS: dict | None = None +_ROUTER_LOCK = threading.Lock() + + +def _embed_batch_router(texts: list[str], tei_url: str) -> list[list[float]]: + resp = 
requests.post(tei_url, json={"inputs": texts}, timeout=30) + resp.raise_for_status() + return resp.json() + + +def _compute_centroid(vectors: list[list[float]]) -> list[float]: + n = len(vectors) + dim = len(vectors[0]) + centroid = [0.0] * dim + for vec in vectors: + for i in range(dim): + centroid[i] += vec[i] + for i in range(dim): + centroid[i] /= n + return centroid + + +def _cosine_similarity(a: list[float], b: list[float]) -> float: + dot = 0.0 + norm_a = 0.0 + norm_b = 0.0 + for i in range(len(a)): + dot += a[i] * b[i] + norm_a += a[i] * a[i] + norm_b += b[i] * b[i] + denom = math.sqrt(norm_a) * math.sqrt(norm_b) + if denom == 0: + return 0.0 + return dot / denom + + +def _ensure_centroids(tei_url: str) -> dict[str, list[float]]: + global _ROUTE_CENTROIDS + if _ROUTE_CENTROIDS is not None: + return _ROUTE_CENTROIDS + with _ROUTER_LOCK: + if _ROUTE_CENTROIDS is not None: + return _ROUTE_CENTROIDS + all_texts = [] + route_ranges: dict[str, tuple[int, int]] = {} + offset = 0 + for route, examples in ROUTE_EXAMPLES.items(): + route_ranges[route] = (offset, offset + len(examples)) + all_texts.extend(examples) + offset += len(examples) + all_vectors = _embed_batch_router(all_texts, tei_url) + centroids = {} + for route, (start, end) in route_ranges.items(): + centroids[route] = _compute_centroid(all_vectors[start:end]) + _ROUTE_CENTROIDS = centroids + return _ROUTE_CENTROIDS + + +def _classify_query( + query: str, + tei_url: str, + threshold: float = 0.45, +) -> tuple[str, float]: + """Classify query intent. 
Returns ("rag_search", 0.0) on any failure.""" + try: + centroids = _ensure_centroids(tei_url) + vecs = _embed_batch_router([query], tei_url) + query_vec = vecs[0] + best_route = "rag_search" + best_score = 0.0 + for route, centroid in centroids.items(): + sim = _cosine_similarity(query_vec, centroid) + if sim > best_score: + best_score = sim + best_route = route + if best_score < threshold: + return ("rag_search", best_score) + return (best_route, best_score) + except Exception as e: + log.warning(f"Router classification failed: {e}") + return ("rag_search", 0.0) + + +# ── Navigation handlers (v4.3.0) ───────────────────────────────────────────── +_COORD_RE = re.compile(r'^(-?\d+\.?\d*)\s*,\s*(-?\d+\.?\d*)$') +_FROM_TO_RE = re.compile(r'from\s+(.+?)\s+to\s+(.+?)(?:\s+by\s+\w+)?$', re.IGNORECASE) +_TO_RE = re.compile(r'(?:to|towards?)\s+(?:the\s+)?(.+?)$', re.IGNORECASE) +_COORD_IN_TEXT_RE = re.compile(r'(-?\d+\.?\d+)\s*,\s*(-?\d+\.?\d+)') +_MODE_MAP = { + "walk": "pedestrian", "walking": "pedestrian", "foot": "pedestrian", "pedestrian": "pedestrian", + "bike": "bicycle", "cycling": "bicycle", "bicycle": "bicycle", "cycle": "bicycle", + "truck": "truck", "lorry": "truck", + "drive": "auto", "driving": "auto", "car": "auto", "auto": "auto", +} + + +def _detect_mode(query: str) -> str: + q = query.lower() + for keyword, mode in _MODE_MAP.items(): + if keyword in q: + return mode + return "auto" + + +def _clean_place(text: str) -> str: + """Clean a place string for geocoding: strip articles, punctuation, normalize 'in' to comma.""" + s = text.strip().rstrip('?.,!') + # Strip leading articles + s = re.sub(r'^(the|a|an)\s+', '', s, flags=re.IGNORECASE) + # "214 North St in Filer ID" → "214 North St, Filer, ID" + s = re.sub(r'\s+in\s+', ', ', s, count=1, flags=re.IGNORECASE) + return s.strip() + + +def _parse_nav_query(query: str) -> tuple[str, str, str] | None: + mode = _detect_mode(query) + m = _FROM_TO_RE.search(query) + if m: + return (_clean_place(m.group(1)), 
def _geocode(query: str, photon_url: str, address_book_url: str = "") -> tuple[float, float, str] | tuple[None, None, None]:
    """Resolve a place string to ``(lat, lon, display_name)``.

    Resolution order: a literal "lat, lon" string, then the address book,
    then a Photon search limited to one result.

    Args:
        query: Place text or a "lat, lon" pair.
        photon_url: Photon base URL; ``/api`` is appended.
        address_book_url: Address book base URL; empty skips that lookup.

    Returns:
        ``(lat, lon, display_name)``, or ``(None, None, None)`` when nothing
        matches or when Photon is unreachable or returns a malformed reply.
    """
    m = _COORD_RE.match(query.strip())
    if m:
        return float(m.group(1)), float(m.group(2)), query

    # Address book lookup (before Photon)
    ab = _address_book_lookup(query, address_book_url)
    if ab:
        # Fall back to the query text rather than raising when the entry
        # carries neither an address nor a name.
        return ab['lat'], ab['lon'], ab.get('address') or ab.get('name') or query

    # Callers only test for a None latitude, so a Photon outage must not
    # raise out of here; _route_valhalla and _handle_reverse_geocode already
    # degrade the same way.
    try:
        resp = requests.get(
            f"{photon_url}/api",
            params={"q": query, "limit": 1},
            timeout=10,
        )
        resp.raise_for_status()
        features = resp.json().get("features", [])
        if not features:
            return None, None, None
        props = features[0]["properties"]
        coords = features[0]["geometry"]["coordinates"]
        # The coordinate pair is consumed as [lon, lat].
        lat, lon = coords[1], coords[0]
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
        log.warning(f"Photon geocode failed for {query!r}: {e}")
        return None, None, None

    parts = [props.get("name", "")]
    for key in ("city", "state", "country"):
        v = props.get(key)
        # Skip a component that repeats the one just added.
        if v and v != parts[-1]:
            parts.append(v)
    return lat, lon, ", ".join(p for p in parts if p)
def _handle_reverse_geocode(query: str, photon_url: str) -> str | None:
    """Answer a "what is at <lat>, <lon>" query through Photon's reverse endpoint.

    Args:
        query: User text expected to contain a decimal "lat, lon" pair.
        photon_url: Photon base URL; ``/reverse`` is appended.

    Returns:
        A one-line description of the location, a "No location found"
        message when Photon has no feature there, or ``None`` when the query
        contains no coordinates or the lookup fails.
    """
    m = _COORD_IN_TEXT_RE.search(query)
    if not m:
        return None
    lat, lon = float(m.group(1)), float(m.group(2))
    try:
        resp = requests.get(
            f"{photon_url}/reverse",
            params={"lat": lat, "lon": lon, "limit": 1},
            timeout=10,
        )
        resp.raise_for_status()
        features = resp.json().get("features", [])
        if not features:
            return f"No location found near coordinates ({lat}, {lon})"
        props = features[0]["properties"]
        parts = []
        for key in ("name", "city", "state", "country"):
            v = props.get(key)
            if v and v not in parts:
                parts.append(v)
        display = ", ".join(parts) if parts else "Unknown location"
        return f"Location: {display} ({lat}, {lon})"
    except Exception as e:
        # Still best-effort, but the failure is now logged instead of being
        # swallowed silently.
        log.warning(f"Reverse geocode failed for ({lat}, {lon}): {e}")
        return None
+ ) + system_msg = next((m for m in messages if m.get("role") == "system"), None) + if system_msg: + system_msg["content"] = system_msg["content"] + nav_block + else: + body["messages"].insert(0, {"role": "system", "content": nav_block}) + + + +def _address_book_lookup(query: str, address_book_url: str) -> dict | None: + """Check RECON address book for exact place match. Returns dict with lat/lon or None.""" + if not address_book_url: + return None + try: + resp = requests.get( + f"{address_book_url}/api/address_book/lookup", + params={"q": query}, + timeout=2, + ) + if resp.status_code == 200: + data = resp.json() + if data.get("confidence") == "exact" and data.get("lat") and data.get("lon"): + log.info(f"Address book hit: {query!r} → {data['name']} ({data['lat']}, {data['lon']})") + return data + return None + except Exception: + return None + + +# ── End router/nav code ────────────────────────────────────────────────────── + +# ── Kiwix Search Helpers (v5.0.0) ──────────────────────────────────────────── + +class _KiwixResultParser(HTMLParser): + """Parse Kiwix search results HTML to extract articles.""" + def __init__(self): + super().__init__() + self.results = [] + self._in_results = False + self._in_li = False + self._in_cite = False + self._in_info = False + self._current = {} + self._capture_text = False + + def handle_starttag(self, tag, attrs): + attrs_dict = dict(attrs) + if tag == "div" and "results" in attrs_dict.get("class", ""): + self._in_results = True + elif self._in_results and tag == "li": + self._in_li = True + self._current = {"title": "", "url": "", "snippet": "", "word_count": ""} + elif self._in_li and tag == "a" and not self._current.get("url"): + self._current["url"] = attrs_dict.get("href", "") + self._capture_text = True + elif self._in_li and tag == "cite": + self._in_cite = True + self._capture_text = True + elif self._in_li and tag == "div" and "informations" in attrs_dict.get("class", ""): + self._in_info = True + 
self._capture_text = True + + def handle_endtag(self, tag): + if tag == "div" and self._in_results and not self._in_li: + self._in_results = False + elif tag == "li" and self._in_li: + if self._current.get("url"): + self.results.append(self._current) + self._current = {} + self._in_li = False + elif tag == "a" and self._capture_text and not self._in_cite: + self._capture_text = False + elif tag == "cite": + self._in_cite = False + self._capture_text = False + elif tag == "div" and self._in_info: + self._in_info = False + self._capture_text = False + + def handle_data(self, data): + if self._capture_text and self._in_li: + text = data.strip() + if self._in_cite: + self._current["snippet"] += text + " " + elif self._in_info: + self._current["word_count"] = text + elif not self._current.get("title"): + self._current["title"] = text + + +def _strip_html_tags(html_content: str) -> str: + """Simple HTML to plain text conversion using stdlib.""" + # Remove script and style elements + text = re.sub(r']*>.*?', '', html_content, flags=re.DOTALL | re.IGNORECASE) + text = re.sub(r']*>.*?', '', text, flags=re.DOTALL | re.IGNORECASE) + # Remove tags + text = re.sub(r'<[^>]+>', ' ', text) + # Decode entities + text = html.unescape(text) + # Normalize whitespace + text = re.sub(r'\s+', ' ', text).strip() + return text + + +def _fetch_kiwix_books() -> list[str]: + """Fetch list of available books from kiwix-serve catalog.""" + try: + resp = requests.get( + f"{KIWIX_BASE_URL}/catalog/v2/entries", + timeout=KIWIX_SEARCH_TIMEOUT, + ) + resp.raise_for_status() + # Extract book names from href attributes + books = re.findall(r'href="/content/([^"]+)"', resp.text) + return list(set(books)) # dedupe + except Exception as e: + log.warning(f"Failed to fetch Kiwix book list: {e}") + return [] + + +def _search_kiwix_book(book: str, query: str, limit: int = 5) -> list[dict]: + """Search a single Kiwix book and return results.""" + try: + resp = requests.get( + f"{KIWIX_BASE_URL}/search", + 
def _fetch_kiwix_article(url_path: str) -> str:
    """Fetch a Kiwix article and return its text content.

    Args:
        url_path: Article path appended to ``KIWIX_BASE_URL``.

    Returns:
        Up to 4000 characters of plain text taken from the first matching
        content container (or the whole page when none matches). Returns
        ``""`` when the request or extraction fails.
    """
    # Containers tried in order of preference. The element names had been
    # lost from these patterns, leaving r']*>(.*?)', which matched at the
    # first '>' with an empty capture, so every article came back empty.
    # NOTE(review): "main", "article" and "div" are reconstructed from the
    # original variable names and comments — confirm against kiwix-serve output.
    container_patterns = (
        r'<main[^>]*>(.*?)</main>',
        r'<article[^>]*>(.*?)</article>',
        r'<div[^>]*class="[^"]*content[^"]*"[^>]*>(.*?)</div>',
    )
    try:
        resp = requests.get(
            f"{KIWIX_BASE_URL}{url_path}",
            timeout=KIWIX_ARTICLE_TIMEOUT,
        )
        resp.raise_for_status()

        content = resp.text
        for pattern in container_patterns:
            match = re.search(pattern, content, re.DOTALL | re.IGNORECASE)
            if match:
                content = match.group(1)
                break

        return _strip_html_tags(content)[:4000]  # Limit to 4000 chars
    except Exception as e:
        log.warning(f"Failed to fetch Kiwix article {url_path}: {e}")
        return ""
def _search_searxng(query: str) -> list[dict]:
    """Query SearXNG's JSON API and return the leading hits.

    Args:
        query: Search text.

    Returns:
        At most ``SEARXNG_MAX_RESULTS`` dicts with ``title``, ``url``,
        ``snippet``, ``engines`` and ``score`` keys. Any failure (non-200
        status, timeout, connection error, bad payload) yields ``[]``.
    """
    try:
        resp = requests.get(
            f"{SEARXNG_URL}/search",
            params={"q": query, "format": "json"},
            timeout=SEARXNG_TIMEOUT,
        )
        if resp.status_code != 200:
            log.warning(f"SearXNG returned status {resp.status_code}")
            return []

        hits = resp.json().get("results", [])
        # Rename SearXNG's "content" field to the "snippet" key used here.
        return [
            {
                "title": hit.get("title", ""),
                "url": hit.get("url", ""),
                "snippet": hit.get("content", ""),
                "engines": hit.get("engines", []),
                "score": hit.get("score", 0),
            }
            for hit in hits[:SEARXNG_MAX_RESULTS]
        ]
    except requests.Timeout:
        log.warning("SearXNG request timed out (offline or slow)")
    except requests.ConnectionError:
        log.warning("SearXNG connection failed (offline)")
    except Exception as e:
        log.warning(f"SearXNG search failed: {e}")
    return []
open(CASCADE_LOG_PATH, "a") as f: + f.write(json.dumps(entry) + "\n") + except Exception as e: + log.warning(f"Failed to log cascade decision: {e}") + + +# ── End cascade helpers ────────────────────────────────────────────────────── + +# Subdomains excluded from Medical results when tactical context detected +_OBSTETRIC_SUBDOMAINS = [ + "Obstetrics", "Midwifery", "Pregnancy", "Pregnancy Care", + "High-Risk Pregnancy", "Childbirth", "Postpartum Care", + "Family Planning", "Contraception", "Breastfeeding", + "Labor Complications", "Twin Delivery", +] + +# Query intent patterns — compiled once at import time +_PROCEDURAL_RE = re.compile( + r"^(how\s+(do|can|should|would|to)\b|steps?\s+(to|for)\b|procedure\s+for\b|technique\s+for\b|way\s+to\b|method\s+(to|for)\b|guide\s+(to|for)\b|instructions?\s+for\b)", + re.IGNORECASE, +) +_FOUNDATIONAL_RE = re.compile( + r"^(what\s+(is|are|does|was|were)\b|explain\b|define\b|why\s+(does|do|is|are|did)\b|describe\b|meaning\s+of\b|difference\s+between\b)", + re.IGNORECASE, +) + +# Tactical keyword patterns for obstetric subdomain exclusion +_TACTICAL_RE = re.compile( + r"\b(tactical|combat|tccc|casevac|medevac|casualty|triage|tourniquet|hemorrhage|wound packing|chest seal|care under fire|point of injury|far forward|buddy aid|self aid|field care|9-line|march algorithm)\b", + re.IGNORECASE, +) + + +def _rerank_by_keyword_overlap(query: str, results: list) -> list: + """Rerank results by boosting those with query term overlap in content/summary/key_facts. + + Adds a boost of up to 0.15 based on the fraction of query tokens found in the result text. + Results are re-sorted by boosted score. 
+ """ + q_tokens = set(re.findall(r'[a-z0-9][-a-z0-9]{2,}', query.lower())) + if not q_tokens: + return results + + reranked = [] + for r in results: + p = r.get("payload", {}) + score = r.get("score", 0) + + # Build searchable text from content, summary, and key_facts + parts = [] + content = p.get("content", "") + if content: + parts.append(content[:2000].lower()) + summary = p.get("summary", "") + if summary: + parts.append(summary.lower()) + key_facts = p.get("key_facts", []) + if isinstance(key_facts, list): + parts.append(" ".join(str(f) for f in key_facts).lower()) + searchable = " ".join(parts) + + # Count how many query tokens appear in the result + if searchable: + matches = sum(1 for t in q_tokens if t in searchable) + overlap_ratio = matches / len(q_tokens) + else: + overlap_ratio = 0 + + # Boost: up to 0.15 for perfect overlap + boosted_score = score + (overlap_ratio * 0.15) + reranked.append({**r, "score": boosted_score}) + + reranked.sort(key=lambda x: -x["score"]) + return reranked + + +class Filter: + class Valves(BaseModel): + tei_url: str = Field( + default="http://100.64.0.14:8090/embed", + description="TEI embedding endpoint", + ) + qdrant_url: str = Field( + default="http://100.64.0.14:6333", + description="Qdrant REST API base URL", + ) + collection: str = Field( + default="recon_knowledge_hybrid", + description="Qdrant collection name", + ) + top_k: int = Field( + default=8, + description="Number of results to retrieve", + ) + score_threshold: float = Field( + default=0.3, + description="Minimum similarity score to include a result", + ) + fallback_min: int = Field( + default=3, + description="Minimum filtered results before falling back to unfiltered search", + ) + candidate_limit: int = Field( + default=50, + description="Initial retrieval pool size for reranking", + ) + rerank_top_n: int = Field( + default=20, + description="Keep top N after FlashRank reranking", + ) + mmr_diversity: float = Field( + default=0.3, + description="MMR 
diversity 0-1 (0=pure relevance, 1=max diversity)", + ) + enabled: bool = Field( + default=True, + description="Enable/disable RECON RAG augmentation", + ) + priority: int = Field( + default=0, + description="Filter execution priority (lower = earlier)", + ) + router_enabled: bool = Field( + default=True, + description="Enable semantic query routing", + ) + router_threshold: float = Field( + default=0.45, + description="Min confidence for route classification", + ) + photon_url: str = Field( + default="http://100.64.0.24:2322", + description="Photon geocoder URL", + ) + valhalla_url: str = Field( + default="http://100.64.0.24:8002", + description="Valhalla routing URL", + ) + address_book_url: str = Field( + default="http://100.64.0.24:8420", + description="RECON address book API base URL", + ) + cascade_enabled: bool = Field( + default=True, + description="Enable three-tier cascade (Qdrant → Kiwix → SearXNG)", + ) + cascade_threshold: float = Field( + default=0.5, + description="FlashRank score threshold for cascade fallthrough", + ) + + def __init__(self): + self.valves = self.Valves() + self._expansion_cache: dict[str, list[str]] = {} + self._ranker = None + self._kiwix_books: list[str] | None = None + + def _get_kiwix_books(self) -> list[str]: + """Get cached list of Kiwix books, fetching on first use.""" + if self._kiwix_books is None: + self._kiwix_books = _fetch_kiwix_books() + log.info(f"Loaded {len(self._kiwix_books)} Kiwix books") + return self._kiwix_books + + def _embed_query(self, text: str) -> list: + """Embed a query string using TEI.""" + resp = requests.post( + self.valves.tei_url, + json={"inputs": text}, + timeout=30, + ) + resp.raise_for_status() + return resp.json()[0] + + def _get_ranker(self): + """Lazy-load FlashRank neural reranker.""" + if self._ranker is None: + from flashrank import Ranker + self._ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2", cache_dir="/tmp/flashrank") + return self._ranker + + def _rerank_flashrank(self, query: 
str, results: list) -> list: + """Rerank results using FlashRank neural reranker. + + Takes Qdrant REST API result dicts (with 'payload' and 'score' keys). + Returns reranked list with updated scores, trimmed to rerank_top_n. + """ + from flashrank import RerankRequest + + ranker = self._get_ranker() + + passages = [] + for i, r in enumerate(results): + p = r.get("payload", {}) + text = p.get("content", "") + if not text: + text = p.get("summary", "") + passages.append({"id": i, "text": text[:2048]}) + + if not passages: + return results + + request = RerankRequest(query=query, passages=passages) + ranked = ranker.rerank(request) + + reranked = [] + for item in ranked[:self.valves.rerank_top_n]: + idx = item["id"] + result_copy = dict(results[idx]) + result_copy["score"] = float(item["score"]) + reranked.append(result_copy) + + return reranked + + def _mmr_select(self, candidates: list, final_k: int) -> list: + """Select final_k results using Maximal Marginal Relevance. + + Penalizes redundancy: same book_title (0.6), same domain (0.3), same source_type (0.1). + Works with Qdrant REST API result dicts. 
+ """ + if len(candidates) <= final_k: + return candidates + + selected = [candidates[0]] + remaining = list(candidates[1:]) + + while len(selected) < final_k and remaining: + best_score = -999 + best_idx = 0 + + for i, candidate in enumerate(remaining): + relevance = candidate.get("score", 0) + cp = candidate.get("payload", {}) + + max_overlap = 0 + for sel in selected: + sp = sel.get("payload", {}) + overlap = 0 + + c_title = cp.get("book_title", "") + s_title = sp.get("book_title", "") + if c_title and s_title and c_title == s_title: + overlap += 0.6 + + c_domain = cp.get("domain", "") + s_domain = sp.get("domain", "") + if c_domain and s_domain and c_domain == s_domain: + overlap += 0.3 + + c_src = cp.get("source_type", "") + s_src = sp.get("source_type", "") + if c_src and s_src and c_src == s_src: + overlap += 0.1 + + max_overlap = max(max_overlap, overlap) + + diversity = self.valves.mmr_diversity + mmr_score = (1 - diversity) * relevance - diversity * max_overlap + + if mmr_score > best_score: + best_score = mmr_score + best_idx = i + + selected.append(remaining.pop(best_idx)) + + return selected + + @staticmethod + def _detect_intent(query: str) -> Optional[list]: + """Detect query intent and return preferred knowledge_types, or None for unfiltered.""" + q = query.strip() + if _PROCEDURAL_RE.search(q): + return ["procedural", "operational"] + if _FOUNDATIONAL_RE.search(q): + return ["foundational"] + return None + + def _search_qdrant( + self, + vector: list, + limit: int, + knowledge_types: Optional[list] = None, + domain: Optional[str] = None, + exclude_subdomains: Optional[list] = None, + ) -> list: + """Search Qdrant for similar vectors, optionally filtered by knowledge_type and/or domain.""" + url = f"{self.valves.qdrant_url}/collections/{self.valves.collection}/points/search" + payload = { + "vector": vector, + "limit": limit, + "with_payload": True, + "score_threshold": self.valves.score_threshold, + } + + must_clauses = [] + must_not_clauses = [] + 
should_clauses = [] + + if domain: + must_clauses.append({"key": "domain", "match": {"value": domain}}) + + if knowledge_types: + for kt in knowledge_types: + should_clauses.append({"key": "knowledge_type", "match": {"value": kt}}) + + if exclude_subdomains: + for sd in exclude_subdomains: + must_not_clauses.append({"key": "subdomain", "match": {"value": sd}}) + + if must_clauses or should_clauses or must_not_clauses: + filter_obj = {} + if must_clauses: + filter_obj["must"] = must_clauses + if should_clauses: + filter_obj["should"] = should_clauses + if must_not_clauses: + filter_obj["must_not"] = must_not_clauses + payload["filter"] = filter_obj + + resp = requests.post(url, json=payload, timeout=30) + resp.raise_for_status() + return resp.json().get("result", []) + + def _boost_transcripts(self, results: list, factor: float = 1.10) -> list: + """Boost transcript source scores to surface video content alongside documents.""" + for r in results: + p = r.get("payload", {}) + if p.get("source_type") == "transcript": + r["score"] = r.get("score", 0) * factor + return results + + def _fetch_guaranteed_transcripts(self, vector: list, domain: str = "Medical", limit: int = 3, exclude_subdomains: Optional[list] = None) -> list: + """Fetch top transcript results for a domain regardless of score threshold.""" + url = f"{self.valves.qdrant_url}/collections/{self.valves.collection}/points/search" + filter_obj = { + "must": [ + {"key": "source_type", "match": {"value": "transcript"}}, + {"key": "domain", "match": {"value": domain}}, + ], + } + if exclude_subdomains: + filter_obj["must_not"] = [ + {"key": "subdomain", "match": {"value": sd}} for sd in exclude_subdomains + ] + payload = { + "vector": vector, + "limit": limit, + "with_payload": True, + "filter": filter_obj, + } + try: + resp = requests.post(url, json=payload, timeout=10) + resp.raise_for_status() + return resp.json().get("result", []) + except Exception as e: + log.warning(f"Guaranteed transcript fetch failed: 
{e}") + return [] + + def _expand_query_ollama(self, query: str) -> list[str]: + """Generate alternative search terms via Ollama. Cached, 10s timeout, fail-safe.""" + if query in self._expansion_cache: + return self._expansion_cache[query] + try: + resp = requests.post( + "http://100.64.0.14:11434/api/generate", + json={ + "model": "goekdenizguelmez/JOSIEFIED-Qwen3:8b", + "prompt": ( + f'Given this search query for a military/survival/preparedness knowledge base: "{query}"\n' + "Generate 3 specific technical search terms that would find TCCC, tactical medicine, " + "or field craft content. Focus on specific procedures, equipment, or doctrine terms " + "— not generic descriptions. Return only the terms, one per line, no numbering, no explanations." + ), + "stream": False, + }, + timeout=10, + ) + resp.raise_for_status() + text = resp.json().get("response", "") + terms = [ + t for t in ( + line.strip().lstrip("0123456789.-)*# ") + for line in text.strip().split("\n") + if line.strip() + ) + if t and len(t) >= 3 + ][:3] + self._expansion_cache[query] = terms + log.info(f"Query expansion: {query!r} → {terms}") + return terms + except Exception as e: + log.warning(f"Query expansion failed (proceeding without): {e}") + self._expansion_cache[query] = [] + return [] + + def _search_expanded_terms( + self, + terms: list[str], + intent_types: Optional[list], + limit: int, + exclude_subdomains: Optional[list] = None, + ) -> list: + """Embed and search expanded query terms in parallel.""" + if not terms: + return [] + + def embed_and_search(term: str) -> list: + vec = self._embed_query(term) + return self._search_qdrant(vec, limit, knowledge_types=intent_types, exclude_subdomains=exclude_subdomains) + + results = [] + with ThreadPoolExecutor(max_workers=min(len(terms), 3)) as pool: + futures = {pool.submit(embed_and_search, t): t for t in terms} + for future in as_completed(futures): + term = futures[future] + try: + results.extend(future.result()) + except Exception as e: + 
log.warning(f"Expanded search for {term!r} failed: {e}") + return results + + def _format_context(self, results: list, tier_tag: str = "DOMAIN_KNOWLEDGE") -> str: + """Format search results into a context block for the system prompt.""" + if not results: + return "" + + blocks = [] + for i, r in enumerate(results, 1): + p = r.get("payload", {}) + score = r.get("score", 0) + + # Build citation line + book = p.get("book_title") or p.get("filename", "Unknown") + page = p.get("page_ref", "") + if page: + page_str = str(page) + if not page_str.startswith("p"): + page_str = f"p. {page_str}" + citation = f"{book}, {page_str}" + else: + citation = book + + # Summary or truncated content + summary = p.get("summary", "") + if not summary: + content = p.get("content", "") + summary = content[:500] + "..." if len(content) > 500 else content + + # Key facts + key_facts = p.get("key_facts", []) + facts_str = "" + if key_facts and isinstance(key_facts, list): + facts_str = "\nKey facts: " + "; ".join(str(f) for f in key_facts[:5]) + + # Domain + domains = p.get("domain", []) + subdomains = p.get("subdomain", []) + domain_str = "" + if domains: + d = ", ".join(domains) if isinstance(domains, list) else str(domains) + if subdomains: + s = ", ".join(subdomains) if isinstance(subdomains, list) else str(subdomains) + domain_str = f"\nDomain: {d} > {s}" + else: + domain_str = f"\nDomain: {d}" + + # Download URL + dl = p.get("download_url", "") + source_type = p.get("source_type", "document") + if dl: + if source_type == "transcript": + dl_str = f"\nSource Video: {dl}" + elif source_type == "web": + dl_str = f"\nSource URL: {dl}" + else: + dl_str = f"\nSource PDF: {dl}" + else: + dl_str = "" + + block = f"[{tier_tag}:{i}] {citation} (relevance: {score:.2f})\n{summary}{facts_str}{domain_str}{dl_str}" + blocks.append(block) + + return "\n\n".join(blocks) + + def _format_kiwix_context(self, results: list[dict]) -> str: + """Format Kiwix search results into a context block.""" + if not 
results: + return "" + + blocks = [] + for i, r in enumerate(results, 1): + title = r.get("title", "Unknown") + snippet = r.get("snippet", "").strip() + book = r.get("book", "") + url_path = r.get("url", "") + + # Build wiki URL + if url_path: + # Extract article path from /content/book/path + path_match = re.search(r'/content/[^/]+/(.+)$', url_path) + if path_match: + article_path = path_match.group(1) + wiki_url = f"https://wiki.echo6.co/viewer#{book}/{article_path}" + else: + wiki_url = f"https://wiki.echo6.co/viewer#{book}" + else: + wiki_url = "" + + # Fetch article content if available + content = "" + if url_path: + content = _fetch_kiwix_article(url_path) + if content: + content = content[:1500] # Limit per article + + if not content: + content = snippet + + block = f"[OFFLINE_WIKI:{i}] {title}\n{content}" + if wiki_url: + block += f"\nSource: {wiki_url}" + blocks.append(block) + + return "\n\n".join(blocks) + + def _format_searxng_context(self, results: list[dict]) -> str: + """Format SearXNG search results into a context block.""" + if not results: + return "" + + blocks = [] + for i, r in enumerate(results, 1): + title = r.get("title", "Unknown") + snippet = r.get("snippet", "") + url = r.get("url", "") + engines = r.get("engines", []) + + engine_str = f" (via {', '.join(engines[:2])})" if engines else "" + + block = f"[WEB_SEARCH:{i}] {title}{engine_str}\n{snippet}" + if url: + block += f"\nSource: {url}" + blocks.append(block) + + return "\n\n".join(blocks) + + async def inlet( + self, + body: dict, + __user__: Optional[dict] = None, + __event_emitter__: Callable[[dict], Awaitable[None]] = None, + ) -> dict: + if not self.valves.enabled: + return body + + # Get the latest user message + messages = body.get("messages", []) + user_messages = [m for m in messages if m.get("role") == "user"] + if not user_messages: + return body + + query = user_messages[-1].get("content", "") + if not query or len(query.strip()) < 3: + return body + + router_intent = 
"rag_search" + + # ── ROUTER GATE (v4.3.0) ───────────────────────────────────────── + if self.valves.router_enabled: + route, confidence = _classify_query( + query, self.valves.tei_url, self.valves.router_threshold + ) + router_intent = route + log.info(f"Router: {query!r} → {route} ({confidence:.3f})") + + if route == "direct_answer": + if __event_emitter__: + await __event_emitter__( + {"type": "status", "data": {"description": "Direct response", "done": True}} + ) + return body + + if route == "nav_route": + if __event_emitter__: + await __event_emitter__( + {"type": "status", "data": {"description": "Getting directions...", "done": False}} + ) + result = _handle_nav_route( + query, + self.valves.photon_url, + self.valves.valhalla_url, + "Buhl, Idaho", + self.valves.address_book_url, + ) + if result: + _inject_nav_context(body, result) + if __event_emitter__: + await __event_emitter__( + {"type": "status", "data": {"description": "Directions ready", "done": True}} + ) + return body + # Fall through to RAG if nav handling fails + + if route == "nav_reverse_geocode": + if __event_emitter__: + await __event_emitter__( + {"type": "status", "data": {"description": "Looking up location...", "done": False}} + ) + result = _handle_reverse_geocode(query, self.valves.photon_url) + if result: + _inject_nav_context(body, result) + if __event_emitter__: + await __event_emitter__( + {"type": "status", "data": {"description": "Location found", "done": True}} + ) + return body + # Fall through to RAG if reverse geocode fails + + # route == "rag_search" or nav fallthrough → continue existing pipeline + + # ── EXISTING RAG PIPELINE ───────────────────────────────────────── + # Emit status + if __event_emitter__: + await __event_emitter__( + { + "type": "status", + "data": { + "description": "Searching RECON knowledge base...", + "done": False, + }, + } + ) + + tier_used = 1 + top_1_score = 0.0 + final_context = "" + final_results = [] + + try: + vector = self._embed_query(query) 
+ + # Detect intent (knowledge_type filter) + intent_types = self._detect_intent(query) + + # Exclude obstetric/midwifery content when tactical context detected + exclude_subs = _OBSTETRIC_SUBDOMAINS if _TACTICAL_RE.search(query) else None + + # Start query expansion in background (runs concurrently with main search) + expansion_executor = ThreadPoolExecutor(max_workers=1) + expansion_future = expansion_executor.submit(self._expand_query_ollama, query) + + # Search Qdrant — unfiltered semantic search, optionally narrowed by knowledge_type + pool_size = self.valves.candidate_limit + if intent_types: + results = self._search_qdrant(vector, pool_size, knowledge_types=intent_types, + exclude_subdomains=exclude_subs) + if len(results) < self.valves.fallback_min: + results = self._search_qdrant(vector, pool_size, exclude_subdomains=exclude_subs) + else: + results = self._search_qdrant(vector, pool_size, exclude_subdomains=exclude_subs) + + # Collect expansion results and merge with main search + try: + expanded_terms = expansion_future.result(timeout=12) + except Exception: + expanded_terms = [] + expansion_executor.shutdown(wait=False) + + if expanded_terms: + expanded_results = self._search_expanded_terms( + expanded_terms, intent_types, pool_size, + exclude_subdomains=exclude_subs, + ) + if expanded_results: + combined = list(results) + expanded_results + seen: dict[str, dict] = {} + for r in combined: + pid = str(r.get("id", "")) + if pid not in seen or (r.get("score") or 0) > (seen[pid].get("score") or 0): + seen[pid] = r + results = sorted(seen.values(), key=lambda x: -(x.get("score") or 0)) + + # Guaranteed transcript inclusion for medical queries + if _TACTICAL_RE.search(query) or any( + kw in query.lower() for kw in ("medical", "medicine", "wound", "trauma", "tourniquet", + "hemorrhage", "bleeding", "fracture", "burn", "cpr", + "first aid", "triage", "casualty") + ): + transcript_results = self._fetch_guaranteed_transcripts(vector, domain="Medical", limit=3, 
exclude_subdomains=exclude_subs) + if transcript_results: + combined = list(results) + transcript_results + seen: dict[str, dict] = {} + for r in combined: + pid = str(r.get("id", "")) + if pid not in seen or (r.get("score") or 0) > (seen[pid].get("score") or 0): + seen[pid] = r + results = sorted(seen.values(), key=lambda x: -(x.get("score") or 0)) + + # Boost transcript sources across all retrieval paths + results = self._boost_transcripts(results) + + # Neural reranking via FlashRank, then MMR diversity selection + try: + results = self._rerank_flashrank(query, results) + results = self._mmr_select(results, self.valves.top_k) + except Exception as e: + log.warning(f"FlashRank reranking failed, falling back to keyword overlap: {e}") + results = _rerank_by_keyword_overlap(query, results) + results = results[:self.valves.top_k] + + # Get top-1 score for cascade decision + top_1_score = results[0]["score"] if results else 0.0 + + # ── CASCADE DECISION POINT (v5.0.0) ────────────────────────────── + if self.valves.cascade_enabled and top_1_score < self.valves.cascade_threshold: + # Tier 1 score too low, try Tier 2 (Kiwix) + log.info(f"Cascade: Tier 1 score {top_1_score:.3f} < {self.valves.cascade_threshold}, trying Kiwix") + + if __event_emitter__: + await __event_emitter__( + {"type": "status", "data": {"description": "Searching offline encyclopedia...", "done": False}} + ) + + kiwix_results = _search_kiwix(query, self._get_kiwix_books()) + + if kiwix_results: + tier_used = 2 + final_context = self._format_kiwix_context(kiwix_results[:KIWIX_MAX_RESULTS]) + log.info(f"Cascade: Tier 2 (Kiwix) returned {len(kiwix_results)} results") + else: + # Tier 2 failed, try Tier 3 (SearXNG) + log.info("Cascade: Tier 2 empty, trying SearXNG") + + if __event_emitter__: + await __event_emitter__( + {"type": "status", "data": {"description": "Searching the web...", "done": False}} + ) + + searxng_results = _search_searxng(query) + + if searxng_results: + tier_used = 3 + final_context 
= self._format_searxng_context(searxng_results) + log.info(f"Cascade: Tier 3 (SearXNG) returned {len(searxng_results)} results") + else: + # All tiers exhausted, fall back to whatever Tier 1 had + log.info("Cascade: All tiers exhausted, using Tier 1 results") + tier_used = 1 + final_context = self._format_context(results, "DOMAIN_KNOWLEDGE") + final_results = results + else: + # Tier 1 score good enough, use Qdrant results + tier_used = 1 + final_context = self._format_context(results, "DOMAIN_KNOWLEDGE") + final_results = results + + # Store results for outlet citations (only for Tier 1) + if tier_used == 1: + chat_id = body.get("chat_id", body.get("metadata", {}).get("chat_id", "")) + if chat_id: + _SOURCE_STORE[chat_id] = final_results + + # Log cascade decision + _log_cascade_decision( + query=query, + router_intent=router_intent, + top_1_score=top_1_score, + tier_used=tier_used, + num_results=len(results) if tier_used == 1 else (len(kiwix_results) if tier_used == 2 else len(searxng_results) if tier_used == 3 else 0), + ) + + # Build the RAG prompt with tier-appropriate instructions + if final_context: + if tier_used == 1: + rag_prompt = ( + "You have access to the RECON knowledge base — a curated library of military field manuals, " + "survival guides, preparedness literature, and video transcripts. Answer the user's question using " + "the reference material below. Reference sources using [DOMAIN_KNOWLEDGE:1], [DOMAIN_KNOWLEDGE:2], etc.\n\n" + "If the reference material doesn't adequately answer the question, say so explicitly rather " + "than filling gaps with general knowledge.\n\n" + "---REFERENCE MATERIAL---\n\n" + f"{final_context}\n\n" + "---END REFERENCE MATERIAL---" + ) + elif tier_used == 2: + rag_prompt = ( + "The RECON domain knowledge base did not have high-confidence results for this query. " + "The following information comes from offline Wikipedia/encyclopedia sources (Kiwix). 
" + "Reference sources using [OFFLINE_WIKI:1], [OFFLINE_WIKI:2], etc.\n\n" + "Note: This is general encyclopedia content, not domain-specific preparedness material.\n\n" + "---OFFLINE WIKI CONTENT---\n\n" + f"{final_context}\n\n" + "---END OFFLINE WIKI CONTENT---" + ) + else: # tier_used == 3 + rag_prompt = ( + "Neither the RECON knowledge base nor offline encyclopedias had relevant content. " + "The following information comes from a live web search. Reference sources using [WEB_SEARCH:1], etc.\n\n" + "Note: Web search results may be less reliable than curated sources. Verify important information.\n\n" + "---WEB SEARCH RESULTS---\n\n" + f"{final_context}\n\n" + "---END WEB SEARCH RESULTS---" + ) + else: + rag_prompt = ( + "You have access to the RECON knowledge base, but no relevant reference material was " + "found for this query in any tier (domain knowledge, offline wiki, or web search). " + "Answer from your general knowledge and clearly flag that your response is NOT backed by references." + ) + + # Add source priority instruction + rag_prompt += ( + "\n\nSource priority: When sources overlap, prefer DOMAIN_KNOWLEDGE over OFFLINE_WIKI over WEB_SEARCH. " + "Always cite which tier your information came from." 
+ ) + + # Inject into system message + system_msg = next( + (m for m in messages if m.get("role") == "system"), None + ) + if system_msg: + system_msg["content"] = system_msg["content"] + "\n\n" + rag_prompt + else: + body["messages"].insert( + 0, {"role": "system", "content": rag_prompt} + ) + + # Emit final status + if __event_emitter__: + tier_names = {1: "RECON", 2: "Kiwix", 3: "Web"} + status_msg = f"Found results from {tier_names.get(tier_used, 'unknown')} (Tier {tier_used})" + await __event_emitter__( + { + "type": "status", + "data": { + "description": status_msg, + "done": True, + }, + } + ) + + except Exception as e: + log.warning(f"RECON RAG search failed: {e}") + if __event_emitter__: + await __event_emitter__( + { + "type": "status", + "data": { + "description": "RECON search unavailable, proceeding without references", + "done": True, + }, + } + ) + + return body + + async def outlet( + self, + body: dict, + __user__: Optional[dict] = None, + __event_emitter__: Callable[[dict], Awaitable[None]] = None, + ) -> dict: + if not self.valves.enabled or not __event_emitter__: + return body + + # Retrieve sources from module-level store (survives instance recreation) + chat_id = body.get("chat_id", "") + sources = _SOURCE_STORE.pop(chat_id, []) + if not sources: + return body + + # Emit citations for each source used + for r in sources: + try: + if not isinstance(r, dict): + continue + p = r.get("payload") or {} + if not isinstance(p, dict): + p = {} + + # Build citation — every field defensively None-checked + book = p.get("book_title") or p.get("filename") or "Unknown Source" + page = p.get("page_ref") + if page is not None and str(page).strip(): + page_str = str(page).strip() + if not page_str.startswith("p"): + page_str = f"p. 
{page_str}" + citation_name = f"{book}, {page_str}" + else: + citation_name = str(book) + + download_url = str(p.get("download_url") or "") + + # Safe summary extraction — handle None/missing without raising + summary = str(p.get("summary") or "") + if not summary: + content = str(p.get("content") or "") + summary = content[:300] if content else "" + + # Safe score formatting + score = r.get("score") + try: + relevance = f"{float(score):.2f}" + except (TypeError, ValueError): + relevance = "0.00" + + author = str(p.get("book_author") or "") + + await __event_emitter__( + { + "type": "source", + "data": { + "document": [summary], + "metadata": [ + { + "source": citation_name, + "url": download_url, + "author": author, + "relevance": relevance, + } + ], + "source": { + "name": citation_name, + "url": download_url, + }, + }, + } + ) + except Exception as e: + pid = r.get("id", "?") if isinstance(r, dict) else "?" + log.warning(f"Failed to emit citation (id={pid}): {e}") + + return body + + +# ── TEST BLOCK ─────────────────────────────────────────────────────────────── +if __name__ == "__main__": + import asyncio + + # Test queries for each tier + TEST_QUERIES = [ + ("tourniquet application steps", "Should hit Tier 1 (RECON)"), + ("population of Ukraine", "Should hit Tier 2 (Kiwix)"), + ("history of the Winter War between Finland and Russia", "Should hit Tier 2 (Kiwix)"), + ("latest iPhone reviews 2026", "Should hit Tier 3 (SearXNG)"), + ("compass declination adjustment", "Should hit Tier 1 (RECON)"), + ("what is the Coriolis effect", "Could go either way"), + ] + + async def run_tests(): + f = Filter() + results = [] + + print("=" * 70) + print("CASCADE TEST RESULTS") + print("=" * 70) + + for query, expected in TEST_QUERIES: + print(f"\n{'─' * 70}") + print(f"Query: {query}") + print(f"Expected: {expected}") + print("─" * 70) + + # Simulate a request body + body = { + "messages": [ + {"role": "user", "content": query} + ], + "chat_id": f"test_{hash(query)}", + } + + 
try: + # Run through inlet + result_body = await f.inlet(body) + + # Extract what was injected + system_msg = next( + (m for m in result_body.get("messages", []) if m.get("role") == "system"), + None + ) + + if system_msg: + content = system_msg.get("content", "") + + # Determine tier used + if "[DOMAIN_KNOWLEDGE:" in content: + tier = 1 + elif "[OFFLINE_WIKI:" in content: + tier = 2 + elif "[WEB_SEARCH:" in content: + tier = 3 + else: + tier = 0 + + print(f"Tier Used: {tier}") + + # Get first 200 chars of context + context_start = content.find("---") + if context_start > 0: + context_preview = content[context_start:context_start+300] + print(f"Context Preview: {context_preview[:200]}...") + + results.append({ + "query": query, + "expected": expected, + "tier": tier, + }) + else: + print("No system message injected") + results.append({ + "query": query, + "expected": expected, + "tier": None, + }) + + except Exception as e: + print(f"ERROR: {e}") + results.append({ + "query": query, + "expected": expected, + "tier": None, + "error": str(e), + }) + + print("\n" + "=" * 70) + print("SUMMARY") + print("=" * 70) + for r in results: + tier_str = f"Tier {r['tier']}" if r.get('tier') else "ERROR" + print(f" {r['query'][:40]:<40} → {tier_str}") + + return results + + asyncio.run(run_tests()) diff --git a/tools/recon_rag_tool_v4.3.0_export.json b/tools/recon_rag_tool_v4.3.0_export.json new file mode 100644 index 0000000..eea94f3 --- /dev/null +++ b/tools/recon_rag_tool_v4.3.0_export.json @@ -0,0 +1,13 @@ +{ + "id": "recon_rag", + "user_id": "421c0f8b-6ae4-4063-b07a-7e08b36851f1", + "name": "RECON Knowledge Base", + "type": "filter", + "content": "\"\"\"\ntitle: RECON Knowledge Base\nauthor: Echo6\nversion: 4.3.0\ndescription: RAG filter that searches the RECON knowledge base and injects reference material into Aurora's context. Emits citations with PDF download links. 
Supports intent-based metadata filtering, FlashRank neural reranking with MMR diversity, Ollama-powered query expansion, transcript source boosting, semantic query routing with inline navigation, and address book place resolution.\n\"\"\"\n\nimport logging\nimport json\nimport math\nimport re\nimport threading\nfrom typing import Optional, Callable, Awaitable\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nimport requests\nfrom pydantic import BaseModel, Field\n\nlog = logging.getLogger(__name__)\n\n# Module-level source store: keyed by chat_id so inlet/outlet share state\n# even if OWI instantiates separate Filter objects per call.\n_SOURCE_STORE: dict[str, list] = {}\n\n# \u2500\u2500 Semantic Query Router (v4.3.0) \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\nROUTE_EXAMPLES = {\n \"nav_route\": [\n \"how do I get to Boise\",\n \"directions to Twin Falls\",\n \"how do I get from Buhl to Boise\",\n \"drive from Jerome to Sun Valley\",\n \"route from Boise to McCall\",\n \"what's the fastest way to Sun Valley\",\n \"how far is it to Twin Falls\",\n \"take me to Shoshone\",\n \"navigate to the airport\",\n \"how do I drive to Salt Lake City\",\n \"walking directions to the park\",\n \"bike route to downtown\",\n ],\n \"nav_reverse_geocode\": [\n \"what town is at 42.5, -114.7\",\n \"where am I right now\",\n \"what is at coordinates 43.6, -116.2\",\n \"what location is 42.574, -114.607\",\n \"where is this place 44.0, -114.3\",\n \"what city is near 42.7, -114.5\",\n \"reverse geocode 43.0, -115.0\",\n \"what's at this location 42.9, -114.8\",\n ],\n \"direct_answer\": [\n \"hello\",\n \"hey aurora\",\n \"good morning\",\n \"thanks\",\n \"thank you\",\n \"what's your name\",\n \"who are you\",\n \"tell me a joke\",\n \"how are you\",\n 
\"hi there\",\n ],\n \"rag_search\": [\n \"what does the survival manual say about water\",\n \"how to purify water in the field\",\n \"how to treat a gunshot wound\",\n \"what is the ranger handbook chapter on patrolling\",\n \"field manual water purification\",\n \"how to build a shelter in the wilderness\",\n \"tactical combat casualty care procedures\",\n \"what does FM 21-76 say about fire starting\",\n ],\n}\n\n_ROUTE_CENTROIDS: dict | None = None\n_ROUTER_LOCK = threading.Lock()\n\n\ndef _embed_batch_router(texts: list[str], tei_url: str) -> list[list[float]]:\n resp = requests.post(tei_url, json={\"inputs\": texts}, timeout=30)\n resp.raise_for_status()\n return resp.json()\n\n\ndef _compute_centroid(vectors: list[list[float]]) -> list[float]:\n n = len(vectors)\n dim = len(vectors[0])\n centroid = [0.0] * dim\n for vec in vectors:\n for i in range(dim):\n centroid[i] += vec[i]\n for i in range(dim):\n centroid[i] /= n\n return centroid\n\n\ndef _cosine_similarity(a: list[float], b: list[float]) -> float:\n dot = 0.0\n norm_a = 0.0\n norm_b = 0.0\n for i in range(len(a)):\n dot += a[i] * b[i]\n norm_a += a[i] * a[i]\n norm_b += b[i] * b[i]\n denom = math.sqrt(norm_a) * math.sqrt(norm_b)\n if denom == 0:\n return 0.0\n return dot / denom\n\n\ndef _ensure_centroids(tei_url: str) -> dict[str, list[float]]:\n global _ROUTE_CENTROIDS\n if _ROUTE_CENTROIDS is not None:\n return _ROUTE_CENTROIDS\n with _ROUTER_LOCK:\n if _ROUTE_CENTROIDS is not None:\n return _ROUTE_CENTROIDS\n all_texts = []\n route_ranges: dict[str, tuple[int, int]] = {}\n offset = 0\n for route, examples in ROUTE_EXAMPLES.items():\n route_ranges[route] = (offset, offset + len(examples))\n all_texts.extend(examples)\n offset += len(examples)\n all_vectors = _embed_batch_router(all_texts, tei_url)\n centroids = {}\n for route, (start, end) in route_ranges.items():\n centroids[route] = _compute_centroid(all_vectors[start:end])\n _ROUTE_CENTROIDS = centroids\n return _ROUTE_CENTROIDS\n\n\ndef 
_classify_query(\n query: str,\n tei_url: str,\n threshold: float = 0.45,\n) -> tuple[str, float]:\n \"\"\"Classify query intent. Returns (\"rag_search\", 0.0) on any failure.\"\"\"\n try:\n centroids = _ensure_centroids(tei_url)\n vecs = _embed_batch_router([query], tei_url)\n query_vec = vecs[0]\n best_route = \"rag_search\"\n best_score = 0.0\n for route, centroid in centroids.items():\n sim = _cosine_similarity(query_vec, centroid)\n if sim > best_score:\n best_score = sim\n best_route = route\n if best_score < threshold:\n return (\"rag_search\", best_score)\n return (best_route, best_score)\n except Exception as e:\n log.warning(f\"Router classification failed: {e}\")\n return (\"rag_search\", 0.0)\n\n\n# \u2500\u2500 Navigation handlers (v4.3.0) \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n_COORD_RE = re.compile(r'^(-?\\d+\\.?\\d*)\\s*,\\s*(-?\\d+\\.?\\d*)$')\n_FROM_TO_RE = re.compile(r'from\\s+(.+?)\\s+to\\s+(.+?)(?:\\s+by\\s+\\w+)?$', re.IGNORECASE)\n_TO_RE = re.compile(r'(?:to|towards?)\\s+(?:the\\s+)?(.+?)$', re.IGNORECASE)\n_COORD_IN_TEXT_RE = re.compile(r'(-?\\d+\\.?\\d+)\\s*,\\s*(-?\\d+\\.?\\d+)')\n_MODE_MAP = {\n \"walk\": \"pedestrian\", \"walking\": \"pedestrian\", \"foot\": \"pedestrian\", \"pedestrian\": \"pedestrian\",\n \"bike\": \"bicycle\", \"cycling\": \"bicycle\", \"bicycle\": \"bicycle\", \"cycle\": \"bicycle\",\n \"truck\": \"truck\", \"lorry\": \"truck\",\n \"drive\": \"auto\", \"driving\": \"auto\", \"car\": \"auto\", \"auto\": \"auto\",\n}\n\n\ndef _detect_mode(query: str) -> str:\n q = query.lower()\n for keyword, mode in _MODE_MAP.items():\n if keyword in q:\n return mode\n return \"auto\"\n\n\ndef _clean_place(text: str) -> str:\n \"\"\"Clean a place string for geocoding: strip articles, punctuation, normalize 
'in' to comma.\"\"\"\n s = text.strip().rstrip('?.,!')\n # Strip leading articles\n s = re.sub(r'^(the|a|an)\\s+', '', s, flags=re.IGNORECASE)\n # \"214 North St in Filer ID\" \u2192 \"214 North St, Filer, ID\"\n s = re.sub(r'\\s+in\\s+', ', ', s, count=1, flags=re.IGNORECASE)\n return s.strip()\n\n\ndef _parse_nav_query(query: str) -> tuple[str, str, str] | None:\n mode = _detect_mode(query)\n m = _FROM_TO_RE.search(query)\n if m:\n return (_clean_place(m.group(1)), _clean_place(m.group(2)), mode)\n m = _TO_RE.search(query)\n if m:\n dest = _clean_place(m.group(1))\n if dest:\n return (None, dest, mode)\n return None\n\n\ndef _geocode(query: str, photon_url: str, address_book_url: str = \"\") -> tuple[float, float, str] | tuple[None, None, None]:\n m = _COORD_RE.match(query.strip())\n if m:\n lat, lon = float(m.group(1)), float(m.group(2))\n return lat, lon, query\n # Address book lookup (before Photon)\n ab = _address_book_lookup(query, address_book_url)\n if ab:\n return ab['lat'], ab['lon'], ab.get('address') or ab['name']\n resp = requests.get(\n f\"{photon_url}/api\",\n params={\"q\": query, \"limit\": 1},\n timeout=10,\n )\n resp.raise_for_status()\n features = resp.json().get(\"features\", [])\n if not features:\n return None, None, None\n props = features[0][\"properties\"]\n coords = features[0][\"geometry\"][\"coordinates\"]\n parts = [props.get(\"name\", \"\")]\n for key in (\"city\", \"state\", \"country\"):\n v = props.get(key)\n if v and v != parts[-1]:\n parts.append(v)\n return coords[1], coords[0], \", \".join(p for p in parts if p)\n\n\ndef _route_valhalla(\n orig: tuple[float, float],\n dest: tuple[float, float],\n mode: str,\n valhalla_url: str,\n) -> str | None:\n try:\n resp = requests.post(\n f\"{valhalla_url}/route\",\n json={\n \"locations\": [\n {\"lat\": orig[0], \"lon\": orig[1]},\n {\"lat\": dest[0], \"lon\": dest[1]},\n ],\n \"costing\": mode,\n \"directions_options\": {\"units\": \"miles\"},\n },\n timeout=30,\n )\n except 
requests.RequestException:\n return None\n if resp.status_code != 200:\n return None\n trip = resp.json()[\"trip\"]\n summary = trip[\"summary\"]\n legs = trip[\"legs\"][0][\"maneuvers\"]\n miles = round(summary[\"length\"], 1)\n minutes = round(summary[\"time\"] / 60, 1)\n lines = [f\"Distance: {miles} miles | Time: {minutes} minutes\", \"\"]\n for i, m in enumerate(legs, 1):\n inst = m[\"instruction\"]\n dist = m.get(\"length\", 0)\n if dist > 0:\n lines.append(f\"{i}. {inst} \u2014 {round(dist, 1)} mi\")\n else:\n lines.append(f\"{i}. {inst}\")\n return \"\\n\".join(lines)\n\n\ndef _handle_nav_route(\n query: str,\n photon_url: str,\n valhalla_url: str,\n default_origin: str,\n address_book_url: str = \"\",\n) -> str | None:\n parsed = _parse_nav_query(query)\n if not parsed:\n return None\n origin_str, dest_str, mode = parsed\n if not origin_str:\n origin_str = default_origin\n orig_lat, orig_lon, orig_name = _geocode(origin_str, photon_url, address_book_url)\n if orig_lat is None:\n return None\n dest_lat, dest_lon, dest_name = _geocode(dest_str, photon_url, address_book_url)\n if dest_lat is None:\n return None\n directions = _route_valhalla(\n (orig_lat, orig_lon), (dest_lat, dest_lon), mode, valhalla_url\n )\n if not directions:\n return None\n return f\"Directions from {orig_name} to {dest_name} ({mode}):\\n{directions}\"\n\n\ndef _handle_reverse_geocode(query: str, photon_url: str) -> str | None:\n m = _COORD_IN_TEXT_RE.search(query)\n if not m:\n return None\n lat, lon = float(m.group(1)), float(m.group(2))\n try:\n resp = requests.get(\n f\"{photon_url}/reverse\",\n params={\"lat\": lat, \"lon\": lon, \"limit\": 1},\n timeout=10,\n )\n resp.raise_for_status()\n features = resp.json().get(\"features\", [])\n if not features:\n return f\"No location found near coordinates ({lat}, {lon})\"\n props = features[0][\"properties\"]\n parts = []\n for key in (\"name\", \"city\", \"state\", \"country\"):\n v = props.get(key)\n if v and v not in parts:\n 
parts.append(v)\n display = \", \".join(parts) if parts else \"Unknown location\"\n return f\"Location: {display} ({lat}, {lon})\"\n except Exception:\n return None\n\n\ndef _inject_nav_context(body: dict, context: str):\n messages = body.get(\"messages\", [])\n nav_block = (\n \"\\n\\n---NAVIGATION RESULT---\\n\\n\"\n f\"{context}\\n\\n\"\n \"---END NAVIGATION RESULT---\\n\\n\"\n \"Present these directions to the user exactly as provided. \"\n \"Do not summarize or omit steps. You may add brief contextual notes.\"\n )\n system_msg = next((m for m in messages if m.get(\"role\") == \"system\"), None)\n if system_msg:\n system_msg[\"content\"] = system_msg[\"content\"] + nav_block\n else:\n body[\"messages\"].insert(0, {\"role\": \"system\", \"content\": nav_block})\n\n\n\ndef _address_book_lookup(query: str, address_book_url: str) -> dict | None:\n \"\"\"Check RECON address book for exact place match. Returns dict with lat/lon or None.\"\"\"\n if not address_book_url:\n return None\n try:\n resp = requests.get(\n f\"{address_book_url}/api/address_book/lookup\",\n params={\"q\": query},\n timeout=2,\n )\n if resp.status_code == 200:\n data = resp.json()\n if data.get(\"confidence\") == \"exact\" and data.get(\"lat\") and data.get(\"lon\"):\n log.info(f\"Address book hit: {query!r} \u2192 {data['name']} ({data['lat']}, {data['lon']})\")\n return data\n return None\n except Exception:\n return None\n\n\n# \u2500\u2500 End router/nav code \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n# Subdomains excluded from Medical results when tactical context detected\n_OBSTETRIC_SUBDOMAINS = [\n \"Obstetrics\", \"Midwifery\", \"Pregnancy\", \"Pregnancy Care\",\n \"High-Risk Pregnancy\", \"Childbirth\", 
\"Postpartum Care\",\n \"Family Planning\", \"Contraception\", \"Breastfeeding\",\n \"Labor Complications\", \"Twin Delivery\",\n]\n\n# Query intent patterns \u2014 compiled once at import time\n_PROCEDURAL_RE = re.compile(\n r\"^(how\\s+(do|can|should|would|to)\\b|steps?\\s+(to|for)\\b|procedure\\s+for\\b|technique\\s+for\\b|way\\s+to\\b|method\\s+(to|for)\\b|guide\\s+(to|for)\\b|instructions?\\s+for\\b)\",\n re.IGNORECASE,\n)\n_FOUNDATIONAL_RE = re.compile(\n r\"^(what\\s+(is|are|does|was|were)\\b|explain\\b|define\\b|why\\s+(does|do|is|are|did)\\b|describe\\b|meaning\\s+of\\b|difference\\s+between\\b)\",\n re.IGNORECASE,\n)\n\n# Tactical keyword patterns for obstetric subdomain exclusion\n_TACTICAL_RE = re.compile(\n r\"\\b(tactical|combat|tccc|casevac|medevac|casualty|triage|tourniquet|hemorrhage|wound packing|chest seal|care under fire|point of injury|far forward|buddy aid|self aid|field care|9-line|march algorithm)\\b\",\n re.IGNORECASE,\n)\n\n\ndef _rerank_by_keyword_overlap(query: str, results: list) -> list:\n \"\"\"Rerank results by boosting those with query term overlap in content/summary/key_facts.\n\n Adds a boost of up to 0.15 based on the fraction of query tokens found in the result text.\n Results are re-sorted by boosted score.\n \"\"\"\n q_tokens = set(re.findall(r'[a-z0-9][-a-z0-9]{2,}', query.lower()))\n if not q_tokens:\n return results\n\n reranked = []\n for r in results:\n p = r.get(\"payload\", {})\n score = r.get(\"score\", 0)\n\n # Build searchable text from content, summary, and key_facts\n parts = []\n content = p.get(\"content\", \"\")\n if content:\n parts.append(content[:2000].lower())\n summary = p.get(\"summary\", \"\")\n if summary:\n parts.append(summary.lower())\n key_facts = p.get(\"key_facts\", [])\n if isinstance(key_facts, list):\n parts.append(\" \".join(str(f) for f in key_facts).lower())\n searchable = \" \".join(parts)\n\n # Count how many query tokens appear in the result\n if searchable:\n matches = sum(1 for t in 
q_tokens if t in searchable)\n overlap_ratio = matches / len(q_tokens)\n else:\n overlap_ratio = 0\n\n # Boost: up to 0.15 for perfect overlap\n boosted_score = score + (overlap_ratio * 0.15)\n reranked.append({**r, \"score\": boosted_score})\n\n reranked.sort(key=lambda x: -x[\"score\"])\n return reranked\n\n\nclass Filter:\n class Valves(BaseModel):\n tei_url: str = Field(\n default=\"http://100.64.0.14:8090/embed\",\n description=\"TEI embedding endpoint\",\n )\n qdrant_url: str = Field(\n default=\"http://100.64.0.14:6333\",\n description=\"Qdrant REST API base URL\",\n )\n collection: str = Field(\n default=\"recon_knowledge_hybrid\",\n description=\"Qdrant collection name\",\n )\n top_k: int = Field(\n default=8,\n description=\"Number of results to retrieve\",\n )\n score_threshold: float = Field(\n default=0.3,\n description=\"Minimum similarity score to include a result\",\n )\n fallback_min: int = Field(\n default=3,\n description=\"Minimum filtered results before falling back to unfiltered search\",\n )\n candidate_limit: int = Field(\n default=50,\n description=\"Initial retrieval pool size for reranking\",\n )\n rerank_top_n: int = Field(\n default=20,\n description=\"Keep top N after FlashRank reranking\",\n )\n mmr_diversity: float = Field(\n default=0.3,\n description=\"MMR diversity 0-1 (0=pure relevance, 1=max diversity)\",\n )\n enabled: bool = Field(\n default=True,\n description=\"Enable/disable RECON RAG augmentation\",\n )\n priority: int = Field(\n default=0,\n description=\"Filter execution priority (lower = earlier)\",\n )\n router_enabled: bool = Field(\n default=True,\n description=\"Enable semantic query routing\",\n )\n router_threshold: float = Field(\n default=0.45,\n description=\"Min confidence for route classification\",\n )\n photon_url: str = Field(\n default=\"http://100.64.0.24:2322\",\n description=\"Photon geocoder URL\",\n )\n valhalla_url: str = Field(\n default=\"http://100.64.0.24:8002\",\n description=\"Valhalla routing 
URL\",\n )\n address_book_url: str = Field(\n default=\"http://100.64.0.24:8420\",\n description=\"RECON address book API base URL\",\n )\n\n def __init__(self):\n self.valves = self.Valves()\n self._expansion_cache: dict[str, list[str]] = {}\n self._ranker = None\n\n def _embed_query(self, text: str) -> list:\n \"\"\"Embed a query string using TEI.\"\"\"\n resp = requests.post(\n self.valves.tei_url,\n json={\"inputs\": text},\n timeout=30,\n )\n resp.raise_for_status()\n return resp.json()[0]\n\n def _get_ranker(self):\n \"\"\"Lazy-load FlashRank neural reranker.\"\"\"\n if self._ranker is None:\n from flashrank import Ranker\n self._ranker = Ranker(model_name=\"ms-marco-MiniLM-L-12-v2\", cache_dir=\"/tmp/flashrank\")\n return self._ranker\n\n def _rerank_flashrank(self, query: str, results: list) -> list:\n \"\"\"Rerank results using FlashRank neural reranker.\n\n Takes Qdrant REST API result dicts (with 'payload' and 'score' keys).\n Returns reranked list with updated scores, trimmed to rerank_top_n.\n \"\"\"\n from flashrank import RerankRequest\n\n ranker = self._get_ranker()\n\n passages = []\n for i, r in enumerate(results):\n p = r.get(\"payload\", {})\n text = p.get(\"content\", \"\")\n if not text:\n text = p.get(\"summary\", \"\")\n passages.append({\"id\": i, \"text\": text[:2048]})\n\n if not passages:\n return results\n\n request = RerankRequest(query=query, passages=passages)\n ranked = ranker.rerank(request)\n\n reranked = []\n for item in ranked[:self.valves.rerank_top_n]:\n idx = item[\"id\"]\n result_copy = dict(results[idx])\n result_copy[\"score\"] = item[\"score\"]\n reranked.append(result_copy)\n\n return reranked\n\n def _mmr_select(self, candidates: list, final_k: int) -> list:\n \"\"\"Select final_k results using Maximal Marginal Relevance.\n\n Penalizes redundancy: same book_title (0.6), same domain (0.3), same source_type (0.1).\n Works with Qdrant REST API result dicts.\n \"\"\"\n if len(candidates) <= final_k:\n return candidates\n\n 
selected = [candidates[0]]\n remaining = list(candidates[1:])\n\n while len(selected) < final_k and remaining:\n best_score = -999\n best_idx = 0\n\n for i, candidate in enumerate(remaining):\n relevance = candidate.get(\"score\", 0)\n cp = candidate.get(\"payload\", {})\n\n max_overlap = 0\n for sel in selected:\n sp = sel.get(\"payload\", {})\n overlap = 0\n\n c_title = cp.get(\"book_title\", \"\")\n s_title = sp.get(\"book_title\", \"\")\n if c_title and s_title and c_title == s_title:\n overlap += 0.6\n\n c_domain = cp.get(\"domain\", \"\")\n s_domain = sp.get(\"domain\", \"\")\n if c_domain and s_domain and c_domain == s_domain:\n overlap += 0.3\n\n c_src = cp.get(\"source_type\", \"\")\n s_src = sp.get(\"source_type\", \"\")\n if c_src and s_src and c_src == s_src:\n overlap += 0.1\n\n max_overlap = max(max_overlap, overlap)\n\n diversity = self.valves.mmr_diversity\n mmr_score = (1 - diversity) * relevance - diversity * max_overlap\n\n if mmr_score > best_score:\n best_score = mmr_score\n best_idx = i\n\n selected.append(remaining.pop(best_idx))\n\n return selected\n\n @staticmethod\n def _detect_intent(query: str) -> Optional[list]:\n \"\"\"Detect query intent and return preferred knowledge_types, or None for unfiltered.\"\"\"\n q = query.strip()\n if _PROCEDURAL_RE.search(q):\n return [\"procedural\", \"operational\"]\n if _FOUNDATIONAL_RE.search(q):\n return [\"foundational\"]\n return None\n\n def _search_qdrant(\n self,\n vector: list,\n limit: int,\n knowledge_types: Optional[list] = None,\n domain: Optional[str] = None,\n exclude_subdomains: Optional[list] = None,\n ) -> list:\n \"\"\"Search Qdrant for similar vectors, optionally filtered by knowledge_type and/or domain.\"\"\"\n url = f\"{self.valves.qdrant_url}/collections/{self.valves.collection}/points/search\"\n payload = {\n \"vector\": vector,\n \"limit\": limit,\n \"with_payload\": True,\n \"score_threshold\": self.valves.score_threshold,\n }\n\n must_clauses = []\n must_not_clauses = []\n 
should_clauses = []\n\n if domain:\n must_clauses.append({\"key\": \"domain\", \"match\": {\"value\": domain}})\n\n if knowledge_types:\n for kt in knowledge_types:\n should_clauses.append({\"key\": \"knowledge_type\", \"match\": {\"value\": kt}})\n\n if exclude_subdomains:\n for sd in exclude_subdomains:\n must_not_clauses.append({\"key\": \"subdomain\", \"match\": {\"value\": sd}})\n\n if must_clauses or should_clauses or must_not_clauses:\n filter_obj = {}\n if must_clauses:\n filter_obj[\"must\"] = must_clauses\n if should_clauses:\n filter_obj[\"should\"] = should_clauses\n if must_not_clauses:\n filter_obj[\"must_not\"] = must_not_clauses\n payload[\"filter\"] = filter_obj\n\n resp = requests.post(url, json=payload, timeout=30)\n resp.raise_for_status()\n return resp.json().get(\"result\", [])\n\n def _boost_transcripts(self, results: list, factor: float = 1.10) -> list:\n \"\"\"Boost transcript source scores to surface video content alongside documents.\"\"\"\n for r in results:\n p = r.get(\"payload\", {})\n if p.get(\"source_type\") == \"transcript\":\n r[\"score\"] = r.get(\"score\", 0) * factor\n return results\n\n def _fetch_guaranteed_transcripts(self, vector: list, domain: str = \"Medical\", limit: int = 3, exclude_subdomains: Optional[list] = None) -> list:\n \"\"\"Fetch top transcript results for a domain regardless of score threshold.\"\"\"\n url = f\"{self.valves.qdrant_url}/collections/{self.valves.collection}/points/search\"\n filter_obj = {\n \"must\": [\n {\"key\": \"source_type\", \"match\": {\"value\": \"transcript\"}},\n {\"key\": \"domain\", \"match\": {\"value\": domain}},\n ],\n }\n if exclude_subdomains:\n filter_obj[\"must_not\"] = [\n {\"key\": \"subdomain\", \"match\": {\"value\": sd}} for sd in exclude_subdomains\n ]\n payload = {\n \"vector\": vector,\n \"limit\": limit,\n \"with_payload\": True,\n \"filter\": filter_obj,\n }\n try:\n resp = requests.post(url, json=payload, timeout=10)\n resp.raise_for_status()\n return 
resp.json().get(\"result\", [])\n except Exception as e:\n log.warning(f\"Guaranteed transcript fetch failed: {e}\")\n return []\n\n def _expand_query_ollama(self, query: str) -> list[str]:\n \"\"\"Generate alternative search terms via Ollama. Cached, 10s timeout, fail-safe.\"\"\"\n if query in self._expansion_cache:\n return self._expansion_cache[query]\n try:\n resp = requests.post(\n \"http://100.64.0.14:11434/api/generate\",\n json={\n \"model\": \"goekdenizguelmez/JOSIEFIED-Qwen3:8b\",\n \"prompt\": (\n f'Given this search query for a military/survival/preparedness knowledge base: \"{query}\"\\n'\n \"Generate 3 specific technical search terms that would find TCCC, tactical medicine, \"\n \"or field craft content. Focus on specific procedures, equipment, or doctrine terms \"\n \"\u2014 not generic descriptions. Return only the terms, one per line, no numbering, no explanations.\"\n ),\n \"stream\": False,\n },\n timeout=10,\n )\n resp.raise_for_status()\n text = resp.json().get(\"response\", \"\")\n terms = [\n t for t in (\n line.strip().lstrip(\"0123456789.-)*# \")\n for line in text.strip().split(\"\\n\")\n if line.strip()\n )\n if t and len(t) >= 3\n ][:3]\n self._expansion_cache[query] = terms\n log.info(f\"Query expansion: {query!r} \u2192 {terms}\")\n return terms\n except Exception as e:\n log.warning(f\"Query expansion failed (proceeding without): {e}\")\n self._expansion_cache[query] = []\n return []\n\n def _search_expanded_terms(\n self,\n terms: list[str],\n intent_types: Optional[list],\n limit: int,\n exclude_subdomains: Optional[list] = None,\n ) -> list:\n \"\"\"Embed and search expanded query terms in parallel.\"\"\"\n if not terms:\n return []\n\n def embed_and_search(term: str) -> list:\n vec = self._embed_query(term)\n return self._search_qdrant(vec, limit, knowledge_types=intent_types, exclude_subdomains=exclude_subdomains)\n\n results = []\n with ThreadPoolExecutor(max_workers=min(len(terms), 3)) as pool:\n futures = 
{pool.submit(embed_and_search, t): t for t in terms}\n for future in as_completed(futures):\n term = futures[future]\n try:\n results.extend(future.result())\n except Exception as e:\n log.warning(f\"Expanded search for {term!r} failed: {e}\")\n return results\n\n def _format_context(self, results: list) -> str:\n \"\"\"Format search results into a context block for the system prompt.\"\"\"\n if not results:\n return \"\"\n\n blocks = []\n for i, r in enumerate(results, 1):\n p = r.get(\"payload\", {})\n score = r.get(\"score\", 0)\n\n # Build citation line\n book = p.get(\"book_title\") or p.get(\"filename\", \"Unknown\")\n page = p.get(\"page_ref\", \"\")\n if page:\n page_str = str(page)\n if not page_str.startswith(\"p\"):\n page_str = f\"p. {page_str}\"\n citation = f\"{book}, {page_str}\"\n else:\n citation = book\n\n # Summary or truncated content\n summary = p.get(\"summary\", \"\")\n if not summary:\n content = p.get(\"content\", \"\")\n summary = content[:500] + \"...\" if len(content) > 500 else content\n\n # Key facts\n key_facts = p.get(\"key_facts\", [])\n facts_str = \"\"\n if key_facts and isinstance(key_facts, list):\n facts_str = \"\\nKey facts: \" + \"; \".join(str(f) for f in key_facts[:5])\n\n # Domain\n domains = p.get(\"domain\", [])\n subdomains = p.get(\"subdomain\", [])\n domain_str = \"\"\n if domains:\n d = \", \".join(domains) if isinstance(domains, list) else str(domains)\n if subdomains:\n s = \", \".join(subdomains) if isinstance(subdomains, list) else str(subdomains)\n domain_str = f\"\\nDomain: {d} > {s}\"\n else:\n domain_str = f\"\\nDomain: {d}\"\n\n # Download URL\n dl = p.get(\"download_url\", \"\")\n source_type = p.get(\"source_type\", \"document\")\n if dl:\n if source_type == \"transcript\":\n dl_str = f\"\\nSource Video: {dl}\"\n elif source_type == \"web\":\n dl_str = f\"\\nSource URL: {dl}\"\n else:\n dl_str = f\"\\nSource PDF: {dl}\"\n else:\n dl_str = \"\"\n\n block = f\"[{i}] {citation} (relevance: 
{score:.2f})\\n{summary}{facts_str}{domain_str}{dl_str}\"\n blocks.append(block)\n\n return \"\\n\\n\".join(blocks)\n\n async def inlet(\n self,\n body: dict,\n __user__: Optional[dict] = None,\n __event_emitter__: Callable[[dict], Awaitable[None]] = None,\n ) -> dict:\n if not self.valves.enabled:\n return body\n\n # Get the latest user message\n messages = body.get(\"messages\", [])\n user_messages = [m for m in messages if m.get(\"role\") == \"user\"]\n if not user_messages:\n return body\n\n query = user_messages[-1].get(\"content\", \"\")\n if not query or len(query.strip()) < 3:\n return body\n\n # \u2500\u2500 ROUTER GATE (v4.3.0) \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n if self.valves.router_enabled:\n route, confidence = _classify_query(\n query, self.valves.tei_url, self.valves.router_threshold\n )\n log.info(f\"Router: {query!r} \u2192 {route} ({confidence:.3f})\")\n\n if route == \"direct_answer\":\n if __event_emitter__:\n await __event_emitter__(\n {\"type\": \"status\", \"data\": {\"description\": \"Direct response\", \"done\": True}}\n )\n return body\n\n if route == \"nav_route\":\n if __event_emitter__:\n await __event_emitter__(\n {\"type\": \"status\", \"data\": {\"description\": \"Getting directions...\", \"done\": False}}\n )\n result = _handle_nav_route(\n query,\n self.valves.photon_url,\n self.valves.valhalla_url,\n \"Buhl, Idaho\",\n self.valves.address_book_url,\n )\n if result:\n _inject_nav_context(body, result)\n if __event_emitter__:\n await __event_emitter__(\n {\"type\": \"status\", \"data\": {\"description\": \"Directions ready\", \"done\": True}}\n )\n return body\n # Fall through to RAG if nav handling fails\n\n if route == \"nav_reverse_geocode\":\n if __event_emitter__:\n await __event_emitter__(\n {\"type\": \"status\", 
\"data\": {\"description\": \"Looking up location...\", \"done\": False}}\n )\n result = _handle_reverse_geocode(query, self.valves.photon_url)\n if result:\n _inject_nav_context(body, result)\n if __event_emitter__:\n await __event_emitter__(\n {\"type\": \"status\", \"data\": {\"description\": \"Location found\", \"done\": True}}\n )\n return body\n # Fall through to RAG if reverse geocode fails\n\n # route == \"rag_search\" or nav fallthrough \u2192 continue existing pipeline\n\n # \u2500\u2500 EXISTING RAG PIPELINE \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n # Emit status\n if __event_emitter__:\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": \"Searching RECON knowledge base...\",\n \"done\": False,\n },\n }\n )\n\n try:\n vector = self._embed_query(query)\n\n # Detect intent (knowledge_type filter)\n intent_types = self._detect_intent(query)\n\n # Exclude obstetric/midwifery content when tactical context detected\n exclude_subs = _OBSTETRIC_SUBDOMAINS if _TACTICAL_RE.search(query) else None\n\n # Start query expansion in background (runs concurrently with main search)\n expansion_executor = ThreadPoolExecutor(max_workers=1)\n expansion_future = expansion_executor.submit(self._expand_query_ollama, query)\n\n # Search Qdrant \u2014 unfiltered semantic search, optionally narrowed by knowledge_type\n pool_size = self.valves.candidate_limit\n if intent_types:\n results = self._search_qdrant(vector, pool_size, knowledge_types=intent_types,\n exclude_subdomains=exclude_subs)\n if len(results) < self.valves.fallback_min:\n results = self._search_qdrant(vector, pool_size, exclude_subdomains=exclude_subs)\n else:\n results = self._search_qdrant(vector, pool_size, exclude_subdomains=exclude_subs)\n\n # Collect expansion results 
and merge with main search\n try:\n expanded_terms = expansion_future.result(timeout=12)\n except Exception:\n expanded_terms = []\n expansion_executor.shutdown(wait=False)\n\n if expanded_terms:\n expanded_results = self._search_expanded_terms(\n expanded_terms, intent_types, pool_size,\n exclude_subdomains=exclude_subs,\n )\n if expanded_results:\n combined = list(results) + expanded_results\n seen: dict[str, dict] = {}\n for r in combined:\n pid = str(r.get(\"id\", \"\"))\n if pid not in seen or (r.get(\"score\") or 0) > (seen[pid].get(\"score\") or 0):\n seen[pid] = r\n results = sorted(seen.values(), key=lambda x: -(x.get(\"score\") or 0))\n\n # Guaranteed transcript inclusion for medical queries\n if _TACTICAL_RE.search(query) or any(\n kw in query.lower() for kw in (\"medical\", \"medicine\", \"wound\", \"trauma\", \"tourniquet\",\n \"hemorrhage\", \"bleeding\", \"fracture\", \"burn\", \"cpr\",\n \"first aid\", \"triage\", \"casualty\")\n ):\n transcript_results = self._fetch_guaranteed_transcripts(vector, domain=\"Medical\", limit=3, exclude_subdomains=exclude_subs)\n if transcript_results:\n combined = list(results) + transcript_results\n seen: dict[str, dict] = {}\n for r in combined:\n pid = str(r.get(\"id\", \"\"))\n if pid not in seen or (r.get(\"score\") or 0) > (seen[pid].get(\"score\") or 0):\n seen[pid] = r\n results = sorted(seen.values(), key=lambda x: -(x.get(\"score\") or 0))\n\n # Boost transcript sources across all retrieval paths\n results = self._boost_transcripts(results)\n\n # Neural reranking via FlashRank, then MMR diversity selection\n try:\n results = self._rerank_flashrank(query, results)\n results = self._mmr_select(results, self.valves.top_k)\n except Exception as e:\n log.warning(f\"FlashRank reranking failed, falling back to keyword overlap: {e}\")\n results = _rerank_by_keyword_overlap(query, results)\n results = results[:self.valves.top_k]\n\n # Store results for outlet citations (module-level, keyed by chat_id)\n chat_id = 
body.get(\"chat_id\", body.get(\"metadata\", {}).get(\"chat_id\", \"\"))\n if chat_id:\n _SOURCE_STORE[chat_id] = results\n\n # Build context block\n context = self._format_context(results)\n\n if context:\n rag_prompt = (\n \"You have access to the RECON knowledge base \u2014 a curated library of military field manuals, \"\n \"survival guides, preparedness literature, and video transcripts. Answer the user's question using \"\n \"the reference material below. Reference sources using [1], [2], [3] etc. matching the \"\n \"numbered sources provided. Use these numbers inline in your response.\\n\\n\"\n \"If the reference material doesn't adequately answer the question, say so explicitly rather \"\n \"than filling gaps with general knowledge.\\n\\n\"\n \"---REFERENCE MATERIAL---\\n\\n\"\n f\"{context}\\n\\n\"\n \"---END REFERENCE MATERIAL---\"\n )\n else:\n rag_prompt = (\n \"You have access to the RECON knowledge base, but no relevant reference material was \"\n \"found for this query. Answer from your general knowledge and clearly flag that your \"\n \"response is NOT backed by the RECON reference library.\"\n )\n\n # Inject into system message\n system_msg = next(\n (m for m in messages if m.get(\"role\") == \"system\"), None\n )\n if system_msg:\n system_msg[\"content\"] = system_msg[\"content\"] + \"\\n\\n\" + rag_prompt\n else:\n body[\"messages\"].insert(\n 0, {\"role\": \"system\", \"content\": rag_prompt}\n )\n\n if __event_emitter__:\n status_msg = f\"Found {len(results)} reference{'s' if len(results) != 1 else ''}\" if results else \"No matching references found\"\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": status_msg,\n \"done\": True,\n },\n }\n )\n\n except Exception as e:\n log.warning(f\"RECON RAG search failed: {e}\")\n if __event_emitter__:\n await __event_emitter__(\n {\n \"type\": \"status\",\n \"data\": {\n \"description\": \"RECON search unavailable, proceeding without references\",\n \"done\": True,\n 
},\n }\n )\n\n return body\n\n async def outlet(\n self,\n body: dict,\n __user__: Optional[dict] = None,\n __event_emitter__: Callable[[dict], Awaitable[None]] = None,\n ) -> dict:\n if not self.valves.enabled or not __event_emitter__:\n return body\n\n # Retrieve sources from module-level store (survives instance recreation)\n chat_id = body.get(\"chat_id\", \"\")\n sources = _SOURCE_STORE.pop(chat_id, [])\n if not sources:\n return body\n\n # Emit citations for each source used\n for r in sources:\n try:\n if not isinstance(r, dict):\n continue\n p = r.get(\"payload\") or {}\n if not isinstance(p, dict):\n p = {}\n\n # Build citation \u2014 every field defensively None-checked\n book = p.get(\"book_title\") or p.get(\"filename\") or \"Unknown Source\"\n page = p.get(\"page_ref\")\n if page is not None and str(page).strip():\n page_str = str(page).strip()\n if not page_str.startswith(\"p\"):\n page_str = f\"p. {page_str}\"\n citation_name = f\"{book}, {page_str}\"\n else:\n citation_name = str(book)\n\n download_url = str(p.get(\"download_url\") or \"\")\n\n # Safe summary extraction \u2014 handle None/missing without raising\n summary = str(p.get(\"summary\") or \"\")\n if not summary:\n content = str(p.get(\"content\") or \"\")\n summary = content[:300] if content else \"\"\n\n # Safe score formatting\n score = r.get(\"score\")\n try:\n relevance = f\"{float(score):.2f}\"\n except (TypeError, ValueError):\n relevance = \"0.00\"\n\n author = str(p.get(\"book_author\") or \"\")\n\n await __event_emitter__(\n {\n \"type\": \"source\",\n \"data\": {\n \"document\": [summary],\n \"metadata\": [\n {\n \"source\": citation_name,\n \"url\": download_url,\n \"author\": author,\n \"relevance\": relevance,\n }\n ],\n \"source\": {\n \"name\": citation_name,\n \"url\": download_url,\n },\n },\n }\n )\n except Exception as e:\n pid = r.get(\"id\", \"?\") if isinstance(r, dict) else \"?\"\n log.warning(f\"Failed to emit citation (id={pid}): {e}\")\n\n return body\n", + 
"meta": "{\"description\": \"RAG filter that searches the RECON knowledge base and injects reference material into Aurora's context. Emits citations with PDF download links. Supports intent-based metadata filtering, FlashRank neural reranking with MMR diversity, Ollama-powered query expansion, transcript source boosting, semantic query routing with inline navigation, and address book place resolution.\", \"manifest\": {\"title\": \"RECON Knowledge Base\", \"author\": \"Echo6\", \"version\": \"1.0.0\", \"description\": \"RAG filter with citations\"}, \"toggle\": false}", + "created_at": 1771176726, + "updated_at": 1776643065, + "valves": null, + "is_active": 1, + "is_global": 0 +}