diff --git a/config/address_book.yaml b/config/address_book.yaml new file mode 100644 index 0000000..24bc81c --- /dev/null +++ b/config/address_book.yaml @@ -0,0 +1,18 @@ +# RECON Address Book — saved locations for navigation shortcuts. +# Entries are matched by name and aliases (case-insensitive). +# Add new entries by appending to the list below. + +entries: + - id: home + name: Home + aliases: + - home + - matt's house + - 214 north st + - 214 north street + address: "214 North St, Filer, ID 83328" + lat: 42.5735833 + lon: -114.6066389 + tags: + - residence + - primary diff --git a/config/profiles/home.yaml b/config/profiles/home.yaml new file mode 100644 index 0000000..de704d9 --- /dev/null +++ b/config/profiles/home.yaml @@ -0,0 +1,67 @@ +# Deployment profile: Home (VM 1130) +# Active on the main Echo6 deployment. Full stack with planet-scale NA tiles. +# Override via RECON_PROFILE env var in /etc/systemd/system/recon.service + +profile: home +region_name: "North America" + +tileset: + url: "/tiles/planet/current.pmtiles" + bounds: [-168, 14, -52, 72] + max_zoom: 15 + attribution: "Protomaps © OSM" + +tileset_hillshade: + url: "/tiles/planet-dem.pmtiles" + encoding: "terrarium" + max_zoom: 12 + +traffic: + provider: "tomtom" + proxy_url: "/api/traffic/flow/{z}/{x}/{y}.png" + +place_details: + local_source: "nominatim" + local_bbox: [-125.0, 31.3, -104.0, 49.0] + fallback_source: "overpass" + +services: + geocode: "/api/geocode" + reverse: "/api/reverse" + address_book: "/api/address_book" + valhalla: "/valhalla" + +auth: + login_url: "/outpost.goauthentik.io/start?rd=%2F" + logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/" + +features: + has_nominatim_details: true + has_kiwix_wiki: true + has_hillshade: true + has_3d_terrain: false + has_traffic_overlay: true + has_landclass: true + has_public_lands_layer: true + has_contours: true + has_contours_test: false + has_contours_test_10ft: false + 
has_address_book_write: false + has_overture_enrichment: true + has_google_places_enrichment: true + has_contacts: true + has_wiki_rewriting: true + has_wiki_discovery: false + has_usfs_trails: true + has_blm_trails: true + +defaults: + center: [42.5736, -114.6066] + zoom: 10 + +# Offroute wilderness routing +offroute: + osm_pbf_path: "/mnt/nav/sources/idaho-latest.osm.pbf" + densify_interval_m: 100 + postgis_dsn: "dbname=padus" + diff --git a/config/profiles/minimal_pi.yaml b/config/profiles/minimal_pi.yaml new file mode 100644 index 0000000..c2fd90a --- /dev/null +++ b/config/profiles/minimal_pi.yaml @@ -0,0 +1,51 @@ +# Deployment profile: Minimal Pi (single-state pocket deployment) +# Template for the lightest possible field kit — Idaho only. +# Override via RECON_PROFILE env var. + +profile: minimal_pi +region_name: "Idaho" + +tileset: + url: "/tiles/idaho.pmtiles" + bounds: [-117.5, 42.0, -111.0, 49.0] + max_zoom: 15 + attribution: "Protomaps © OSM" + +tileset_hillshade: + url: "/tiles/hillshade-idaho.pmtiles" + encoding: "terrarium" + max_zoom: 12 + +traffic: + provider: "tomtom" + proxy_url: "/api/traffic/flow/{z}/{x}/{y}.png" + +services: + geocode: "/api/geocode" + reverse: "/api/reverse" + address_book: "/api/address_book" + valhalla: "/valhalla" + +# TODO(matt): confirm logout next= host for this profile +auth: + login_url: "/outpost.goauthentik.io/start?rd=%2F" + logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/" + +features: + has_nominatim_details: false + has_kiwix_wiki: false + has_hillshade: false + has_3d_terrain: false + has_traffic_overlay: false + has_landclass: false + has_public_lands_layer: false + has_address_book_write: true + has_overture_enrichment: false + has_google_places_enrichment: false + has_contacts: false + has_wiki_rewriting: false + has_wiki_discovery: false + +defaults: + center: [44.0, -114.0] + zoom: 7 diff --git a/config/profiles/regional_pi.yaml 
b/config/profiles/regional_pi.yaml new file mode 100644 index 0000000..b6f2cad --- /dev/null +++ b/config/profiles/regional_pi.yaml @@ -0,0 +1,59 @@ +# Deployment profile: Regional Pi (multi-state field kit) +# Template for a Raspberry Pi covering Idaho + surrounding states. +# Override via RECON_PROFILE env var. + +profile: regional_pi +region_name: "Idaho + Neighbors" + +tileset: + url: "/tiles/regional.pmtiles" + bounds: [-125, 40, -104, 49] + max_zoom: 15 + attribution: "Protomaps © OSM" + +tileset_hillshade: + url: "/tiles/hillshade-regional.pmtiles" + encoding: "terrarium" + max_zoom: 12 + +traffic: + provider: "tomtom" + proxy_url: "/api/traffic/flow/{z}/{x}/{y}.png" + +place_details: + local_source: "nominatim" + local_bbox: [-125.0, 40.0, -104.0, 49.0] + fallback_source: "overpass" + +services: + geocode: "/api/geocode" + reverse: "/api/reverse" + address_book: "/api/address_book" + valhalla: "/valhalla" + +# TODO(matt): confirm logout next= host for this profile +auth: + login_url: "/outpost.goauthentik.io/start?rd=%2F" + logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/" + +features: + has_nominatim_details: true + has_kiwix_wiki: false + has_hillshade: true + has_3d_terrain: false + has_traffic_overlay: true + has_landclass: true + has_public_lands_layer: true + has_contours: true + has_contours_test: true + has_contours_test_10ft: true + has_address_book_write: true + has_overture_enrichment: false + has_google_places_enrichment: false + has_contacts: false + has_wiki_rewriting: true + has_wiki_discovery: false + +defaults: + center: [44.0, -114.0] + zoom: 7 diff --git a/lib/api.py b/lib/api.py index 6a3d627..a0697bf 100644 --- a/lib/api.py +++ b/lib/api.py @@ -57,6 +57,10 @@ class _LargeZimRequest(_FlaskRequest): return super()._get_file_stream(total_content_length, content_type, filename, content_length) app.request_class = _LargeZimRequest +# ── Netsyms Blueprint ── +from .netsyms_api import netsyms_bp 
"""
title: Navigation
author: Echo6
version: 1.1.0
description: Turn-by-turn directions and geocoding via Photon + Valhalla on recon-vm. Supports driving, walking, cycling, and truck routing with worldwide coverage (281M places).
"""

import re
import json
import requests
from pydantic import BaseModel, Field

_COORD_RE = re.compile(r'^(-?\d+\.?\d*)\s*,\s*(-?\d+\.?\d*)$')


class Tools:
    """Navigation tool: geocode two places with Photon, then route with Valhalla."""

    class Valves(BaseModel):
        photon_url: str = Field(
            default="http://100.64.0.24:2322",
            description="Photon geocoding service URL (recon-vm)",
        )
        valhalla_url: str = Field(
            default="http://100.64.0.24:8002",
            description="Valhalla routing service URL (recon-vm)",
        )

    def __init__(self):
        self.valves = self.Valves()

    def _geocode(self, query: str):
        """Resolve *query* to ``(lat, lon, label)``.

        A literal ``"lat,lon"`` pair is parsed locally; any other text is sent
        to Photon and the first hit is used.

        Returns ``(None, None, None)`` when nothing matches, when a literal
        coordinate pair is outside the valid lat/lon range, or when the Photon
        feature is malformed.

        Raises ``requests.RequestException`` (or ``ValueError`` for a non-JSON
        body) when the Photon service itself fails; ``get_directions`` turns
        that into a user-facing message.
        """
        m = _COORD_RE.match(query.strip())
        if m:
            lat, lon = float(m.group(1)), float(m.group(2))
            # Reject impossible coordinates here instead of letting the
            # routing call fail with a misleading "no route" message.
            if not (-90.0 <= lat <= 90.0 and -180.0 <= lon <= 180.0):
                return None, None, None
            return lat, lon, query

        resp = requests.get(
            f"{self.valves.photon_url}/api",
            params={"q": query, "limit": 1},
            timeout=10,
        )
        resp.raise_for_status()
        features = resp.json().get("features", [])
        if not features:
            return None, None, None

        try:
            props = features[0].get("properties") or {}
            # GeoJSON order is [lon, lat].
            lon, lat = features[0]["geometry"]["coordinates"][:2]
        except (AttributeError, KeyError, TypeError, ValueError):
            return None, None, None

        # Build "name, city, state, country", skipping consecutive duplicates
        # (e.g. a city whose name equals the place name).
        parts = [props.get("name", "")]
        for key in ("city", "state", "country"):
            v = props.get(key)
            if v and v != parts[-1]:
                parts.append(v)
        label = ", ".join(p for p in parts if p)
        # Fall back to the user's own text so the header never reads "from  to".
        return lat, lon, label or query

    def get_directions(
        self,
        origin: str,
        destination: str,
        mode: str = "auto",
    ) -> str:
        """
        Get turn-by-turn directions between two locations.
        When this tool returns results, present the directions exactly as returned — do not summarize or rephrase. Include all steps.

        :param origin: Starting location — address, place name, or lat,lon coordinates
        :param destination: Destination — address, place name, or lat,lon coordinates
        :param mode: Travel mode: auto, pedestrian, bicycle, or truck (default: auto)
        :return: Formatted turn-by-turn directions
        """
        # Unknown costing models silently fall back to driving.
        if mode not in ("auto", "pedestrian", "bicycle", "truck"):
            mode = "auto"

        # Geocoder outages are reported the same way as router outages
        # instead of escaping to the caller as an exception.
        try:
            orig_lat, orig_lon, orig_name = self._geocode(origin)
            if orig_lat is None:
                return f"Could not find location: {origin}"

            dest_lat, dest_lon, dest_name = self._geocode(destination)
            if dest_lat is None:
                return f"Could not find location: {destination}"
        except (requests.RequestException, ValueError):
            return "Navigation service unavailable"

        try:
            resp = requests.post(
                f"{self.valves.valhalla_url}/route",
                json={
                    "locations": [
                        {"lat": orig_lat, "lon": orig_lon},
                        {"lat": dest_lat, "lon": dest_lon},
                    ],
                    "costing": mode,
                    "directions_options": {"units": "miles"},
                },
                timeout=30,
            )
        except requests.RequestException:
            return "Navigation service unavailable"

        if resp.status_code != 200:
            return "No route found between locations"

        # A 200 with an unexpected body is treated as "no route" rather than
        # surfacing a KeyError/IndexError.
        try:
            trip = resp.json()["trip"]
            summary = trip["summary"]
            maneuvers = trip["legs"][0]["maneuvers"]

            miles = round(summary["length"], 1)
            minutes = round(summary["time"] / 60, 1)

            lines = [
                f"Directions from {orig_name} to {dest_name} ({mode}):",
                f"Distance: {miles} miles | Time: {minutes} minutes",
                "",
            ]
            for i, step in enumerate(maneuvers, 1):
                inst = step["instruction"]
                dist = step.get("length", 0)
                if dist > 0:
                    lines.append(f"{i}. {inst} — {round(dist, 1)} mi")
                else:
                    lines.append(f"{i}. {inst}")
        except (KeyError, IndexError, TypeError, ValueError):
            return "No route found between locations"

        return "\n".join(lines)
from functools import wraps
from flask import request, jsonify


def get_user_id():
    """Read the caller's identity from the ``X-Authentik-Username`` header.

    Returns the header value, or None when the header is absent.
    """
    username = request.headers.get('X-Authentik-Username')
    return username


def require_auth(f):
    """Decorator: reject the request with 401 unless an Authentik user is present.

    On success the user name is stored on ``request.user_id`` before the
    wrapped view runs.
    """
    @wraps(f)
    def wrapper(*args, **kwargs):
        identity = get_user_id()
        if identity:
            request.user_id = identity
            return f(*args, **kwargs)
        return jsonify({'error': 'Authentication required'}), 401
    return wrapper
" + f"Available profiles: {', '.join(f.replace('.yaml','') for f in os.listdir(config_dir) if f.endswith('.yaml'))}" + ) + + with open(config_path, 'r') as f: + _config_cache = yaml.safe_load(f) + + logger.info(f"Loaded deployment profile: {profile} ({_config_cache.get('region_name', 'unknown')})") + return _config_cache + + +def get_deployment_config(): + """Return the cached deployment config dict.""" + if _config_cache is None: + load_deployment_config() + return _config_cache + + +# Load on import so startup fails fast if profile is missing +load_deployment_config() diff --git a/lib/extractor.py b/lib/extractor.py index 13159c9..bc236ab 100644 --- a/lib/extractor.py +++ b/lib/extractor.py @@ -21,6 +21,7 @@ Config: processing.extract_workers, processing.max_pdf_size_mb, processing.extract_timeout, processing.page_timeout """ import base64 +import re import json import os import random @@ -99,6 +100,40 @@ def _is_transient(error_str): return any(sig in s for sig in transient_signals) +def _text_quality_ok(text, min_length=50): + """Check if extracted text meets quality thresholds. + + Beyond the basic length check, validates: + - Word-boundary ratio: at least 60% of tokens should be real words (2+ alpha chars) + - Concatenation ratio: lowercase-immediately-followed-by-uppercase shouldn't exceed 10% of word count + + Returns True if text passes all checks. 
+ """ + text = text.strip() + if len(text) < min_length: + return False + + words = text.split() + if not words: + return False + + # Word-like ratio: tokens with 2+ alphabetic characters + word_like = sum(1 for w in words if len(re.findall(r'[a-zA-Z]', w)) >= 2) + word_ratio = word_like / len(words) + if word_ratio < 0.60: + return False + + # Concatenation detector: lowercase immediately followed by uppercase + # Filter out common camelCase patterns in code (short tokens) + concat_hits = len(re.findall(r'[a-z][A-Z]', text)) + concat_ratio = concat_hits / len(words) if words else 0 + if concat_ratio > 0.10: + return False + + return True + + + def _render_page_to_png(pdf_path, page_num_1indexed, dpi=200, timeout=30): """Render a single PDF page to PNG bytes using pdftoppm. @@ -224,7 +259,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30): # Method 1: pdftotext (poppler) try: result = subprocess.run( - ['pdftotext', '-f', str(page_num_0indexed + 1), + ['pdftotext', '-layout', '-f', str(page_num_0indexed + 1), '-l', str(page_num_0indexed + 1), pdf_path, '-'], capture_output=True, text=True, timeout=page_timeout ) @@ -233,7 +268,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30): except Exception: pass - if len(text.strip()) >= 50: + if _text_quality_ok(text): return text, 'pdftotext' # Method 2: pdftoppm + Tesseract OCR @@ -258,7 +293,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30): except Exception: pass - if len(text.strip()) >= 50: + if _text_quality_ok(text): return text, 'tesseract' # Method 3: Gemini Vision (last resort) @@ -276,8 +311,26 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30): # ── Core extraction functions ── def _pypdf2_extract(reader, page_num): - """Extract text from a PyPDF2 page object. 
Runs inside a thread for timeout.""" - return reader.pages[page_num].extract_text() or '' + """Extract text from a PyPDF2 page object. Runs inside a thread for timeout. + + Tries default extraction first (space_width=200). If quality check fails, + retries with space_width=100 which better detects word boundaries in + tightly-kerned PDFs (common in Haynes/workshop manuals). + + Note: PyPDF2 3.0.1 does not support layout=True. The space_width parameter + controls word-boundary detection tolerance. Lower values = more aggressive + space insertion between characters. + """ + text = reader.pages[page_num].extract_text() or '' + if _text_quality_ok(text): + return text + + # Retry with tighter word-boundary detection + text_tight = reader.pages[page_num].extract_text(space_width=100.0) or '' + if len(text_tight.strip()) >= len(text.strip()): + return text_tight + + return text def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30): @@ -302,13 +355,13 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30): except Exception: text = '' - if len(text.strip()) >= 50: + if _text_quality_ok(text): return text, 'pypdf2' # Method 2: pdftotext via subprocess (inherently timeout-safe) try: result = subprocess.run( - ['pdftotext', '-f', str(page_num + 1), '-l', str(page_num + 1), pdf_path, '-'], + ['pdftotext', '-layout', '-f', str(page_num + 1), '-l', str(page_num + 1), pdf_path, '-'], capture_output=True, text=True, timeout=page_timeout ) if result.returncode == 0 and len(result.stdout.strip()) > len(text.strip()): @@ -316,7 +369,7 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30): except Exception: pass - if len(text.strip()) >= 50: + if _text_quality_ok(text): return text, 'pdftotext' # Method 3: pdftoppm + Tesseract OCR @@ -340,7 +393,7 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30): except Exception: pass - if len(text.strip()) >= 50: + if _text_quality_ok(text): return text, 'tesseract' 
"""
RECON Netsyms AddressDatabase2025 — SQLite-backed US+CA address lookup.

Provides 159.78M geocoded addresses as tier-2 between address book
(exact named locations) and Photon (full-text global geocoding).

Database: /mnt/nav/addresses/AddressDatabase2025.sqlite (read-only)
"""

import os
import re
import sqlite3
import threading

from .utils import setup_logging

logger = setup_logging('recon.netsyms')

_DB_PATH = '/mnt/nav/addresses/AddressDatabase2025.sqlite'

# One shared connection; every statement on it runs under _lock.
_conn = None
_lock = threading.Lock()
# The database is opened read-only, so these aggregates are computed once.
_cached_row_count = None
_cached_countries = None

# US states + DC + territories, CA provinces, for free-text parsing
_STATE_CODES = {
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
    'DC', 'PR', 'VI', 'GU', 'AS', 'MP',
    # Canadian provinces
    'AB', 'BC', 'MB', 'NB', 'NL', 'NS', 'NT', 'NU', 'ON', 'PE',
    'QC', 'SK', 'YT',
}

_NUMBER_RE = re.compile(r'^(\d+[\w-]*)(.*)$')
# Trailing 5-digit ZIP, optionally with a +4 suffix (only the 5 digits are used).
_ZIP_TAIL_RE = re.compile(r'\b(\d{5})(?:-\d{4})?\s*$')


def _get_conn():
    """Lazy-open the shared read-only SQLite connection.

    The connection is fully configured (row factory set) before it is
    published to the module global, so a thread taking the lock-free fast
    path can never observe a connection that still returns plain tuples.

    Raises:
        sqlite3.Error: if the database cannot be opened.
    """
    global _conn
    if _conn is not None:
        return _conn
    with _lock:
        if _conn is None:
            uri = f'file:{_DB_PATH}?mode=ro'
            conn = sqlite3.connect(uri, uri=True, check_same_thread=False)
            conn.row_factory = sqlite3.Row
            _conn = conn
            logger.info("Netsyms DB opened: %s", _DB_PATH)
    return _conn


def _row_to_dict(row):
    """Convert a sqlite3.Row to a plain dict with lat/lon keys."""
    return {
        'zipcode': row['zipcode'],
        'number': row['number'],
        'street': row['street'],
        'street2': row['street2'],
        'city': row['city'],
        'state': row['state'],
        'plus4': row['plus4'],
        'country': row['country'],
        'lat': float(row['latitude']),
        'lon': float(row['longitude']),
        'source': row['source'],
    }


def _fetch(sql, params, what):
    """Run a read query on the shared connection and return result dicts.

    Any SQLite failure, including failure to open the database, is logged
    under the name *what* and reported as an empty result.
    """
    try:
        conn = _get_conn()
        with _lock:
            rows = conn.execute(sql, params).fetchall()
    except sqlite3.Error as e:
        logger.warning("Netsyms %s error: %s", what, e)
        return []
    return [_row_to_dict(r) for r in rows]


def lookup_by_street(number, street, city=None, state=None,
                     zipcode=None, country=None, limit=20):
    """Match on number + street, with optional qualifiers.

    Text qualifiers are upper-cased before comparison. Returns a list of
    address dicts (at most *limit*), or ``[]`` on no match or database error.
    """
    clauses = ['number = ?', 'street = ?']
    params = [str(number).strip().upper(), street.strip().upper()]

    if city:
        clauses.append('city = ?')
        params.append(city.strip().upper())
    if state:
        clauses.append('state = ?')
        params.append(state.strip().upper())
    if zipcode:
        clauses.append('zipcode = ?')
        params.append(zipcode.strip())
    if country:
        clauses.append('country = ?')
        params.append(country.strip().upper())

    # Only fixed column names are interpolated; every value is bound.
    sql = f"SELECT * FROM addresses WHERE {' AND '.join(clauses)} LIMIT ?"
    params.append(limit)

    results = _fetch(sql, params, 'lookup_by_street')
    logger.debug("lookup_by_street(%s, %s, city=%s, state=%s) → %d results",
                 number, street, city, state, len(results))
    return results


def _parse_free_text(query):
    """Split a free-text address into ``(number, street, city, state, zipcode)``.

    Missing parts are ``None`` (``''`` for an absent street). When the query
    contains commas they are trusted as field separators, so multi-word
    cities work ("123 Main St, Twin Falls, ID"). Without commas the last
    token after the street is assumed to be a one-word city, and only when a
    state code is present.
    """
    q = query.strip()

    zipcode = None
    zip_match = _ZIP_TAIL_RE.search(q)
    if zip_match:
        zipcode = zip_match.group(1)
        q = q[:zip_match.start()].strip().rstrip(',').strip()

    segments = [s.strip() for s in q.split(',') if s.strip()]
    state = None
    city = None

    if len(segments) >= 2:
        # "street[, unit…], city[,] ST"
        tokens = segments[0].split()
        tail = segments[1:]
        last = tail[-1].split()
        if last[-1].upper() in _STATE_CODES:
            state = last.pop().upper()
        if last:
            city = ' '.join(last)
        elif len(tail) >= 2:
            # The state stood alone ("…, Filer, ID"): city is the segment before it.
            city = tail[-2]
    else:
        tokens = segments[0].split() if segments else []
        if len(tokens) >= 2 and tokens[-1].upper() in _STATE_CODES:
            state = tokens.pop().upper()

    # Leading digits → house number
    number = None
    if tokens and re.match(r'^\d', tokens[0]):
        number = tokens.pop(0)

    # Comma-less form: with a state present, the last remaining token is the city.
    if len(segments) < 2 and state and len(tokens) >= 2:
        city = tokens.pop()

    return number, ' '.join(tokens), city, state, zipcode


def lookup_free_text(query, country_hint=None):
    """Parse a free-text address and look it up.

    Tries an exact number + street match first, narrowed by whatever city,
    state, ZIP, and *country_hint* were found. If that yields nothing and the
    query ended in a ZIP code, including a query that is only a ZIP code,
    falls back to a ZIP lookup capped at 20 rows. Returns ``[]`` otherwise.
    """
    if not query or not query.strip():
        return []

    number, street, city, state, zipcode = _parse_free_text(query)

    if number and street:
        results = lookup_by_street(number, street, city=city, state=state,
                                   zipcode=zipcode, country=country_hint)
        if results:
            logger.debug("lookup_free_text(%r) → %d results via street match",
                         query, len(results))
            return results

    # Fallback: try zipcode only if available
    if zipcode:
        return lookup_by_zipcode(zipcode, limit=20)

    logger.debug("lookup_free_text(%r) → 0 results", query)
    return []


def lookup_by_zipcode(zipcode, limit=100):
    """Direct zipcode lookup; returns up to *limit* address dicts."""
    sql = "SELECT * FROM addresses WHERE zipcode = ? LIMIT ?"
    results = _fetch(sql, [zipcode.strip(), limit], 'lookup_by_zipcode')
    logger.debug("lookup_by_zipcode(%s) → %d results", zipcode, len(results))
    return results


def health():
    """Health check with cached row count and country list.

    Both aggregates require a full scan of the table, so each is computed at
    most once per process; a failed query is not cached and is retried on the
    next call. ``ok`` is True only when the file exists, the database opens,
    and both aggregates are available.
    """
    global _cached_row_count, _cached_countries

    try:
        file_size = os.path.getsize(_DB_PATH)
    except OSError:
        return {'ok': False, 'row_count': 0, 'file_size_bytes': 0,
                'indexed_countries': []}

    try:
        conn = _get_conn()
    except sqlite3.Error:
        return {'ok': False, 'row_count': 0, 'file_size_bytes': file_size,
                'indexed_countries': []}

    with _lock:
        if _cached_row_count is None:
            try:
                row = conn.execute(
                    "SELECT COUNT(*) AS cnt FROM addresses"
                ).fetchone()
                _cached_row_count = row['cnt']
            except sqlite3.Error as e:
                logger.warning("Netsyms health row count error: %s", e)
        if _cached_countries is None:
            try:
                rows = conn.execute(
                    "SELECT DISTINCT country FROM addresses"
                ).fetchall()
                _cached_countries = sorted(
                    r['country'] for r in rows if r['country'] is not None
                )
            except sqlite3.Error as e:
                logger.warning("Netsyms health country scan error: %s", e)
        row_count = _cached_row_count
        countries = _cached_countries

    return {
        'ok': row_count is not None and countries is not None,
        'row_count': row_count or 0,
        'file_size_bytes': file_size,
        'indexed_countries': list(countries or []),
    }
(lowercase)") + + +def test_lookup_by_street_uppercase(): + results = netsyms.lookup_by_street("214", "NORTH ST", city="FILER", state="ID") + assert len(results) >= 1, f"Expected at least 1 result, got {len(results)}" + r = results[0] + assert abs(r['lat'] - 42.5736) < 0.01, f"Lat mismatch: {r['lat']}" + print(" PASS: lookup_by_street (uppercase)") + + +def test_lookup_nonexistent(): + results = netsyms.lookup_by_street("999999", "Nonexistent Rd", + city="Filer", state="ID") + assert results == [], f"Expected empty list, got {len(results)} results" + print(" PASS: lookup_by_street (nonexistent)") + + +def test_free_text_with_commas(): + results = netsyms.lookup_free_text("214 North St, Filer, ID") + assert len(results) >= 1, f"Expected at least 1 result, got {len(results)}" + r = results[0] + assert r['city'] == 'FILER', f"City mismatch: {r['city']}" + assert r['state'] == 'ID', f"State mismatch: {r['state']}" + print(" PASS: lookup_free_text (commas)") + + +def test_free_text_no_commas(): + results = netsyms.lookup_free_text("214 North St Filer ID") + assert len(results) >= 1, f"Expected at least 1 result, got {len(results)}" + r = results[0] + assert r['state'] == 'ID', f"State mismatch: {r['state']}" + print(" PASS: lookup_free_text (no commas)") + + +def test_lookup_by_zipcode(): + results = netsyms.lookup_by_zipcode("83328", limit=5) + assert len(results) == 5, f"Expected 5 results, got {len(results)}" + for r in results: + assert r['zipcode'] == '83328', f"Zipcode mismatch: {r['zipcode']}" + print(" PASS: lookup_by_zipcode") + + +def test_health(): + h = netsyms.health() + assert h['ok'] is True, f"Health not OK: {h}" + assert h['row_count'] >= 159_000_000, f"Row count too low: {h['row_count']}" + assert 'US' in h['indexed_countries'], f"US not in countries: {h['indexed_countries']}" + assert 'CA' in h['indexed_countries'], f"CA not in countries: {h['indexed_countries']}" + print(" PASS: health") + + +if __name__ == '__main__': + print("Running Netsyms 
def _flatten_table(table_el):
    """Convert a <table> element to pipe-delimited text.

    Each <tr> becomes one line whose non-empty <td>/<th> cells are joined by
    ' | ' (empty cells are omitted). Returns the lines joined by newlines, or
    '' when no row contains text. The caller adds any surrounding blank lines.
    """
    rows = []
    for tr in table_el.iter('tr'):
        cells = []
        for cell in tr:
            if cell.tag in ('td', 'th'):
                cell_text = (cell.text_content() or '').strip()
                # Collapse internal whitespace in each cell
                cell_text = re.sub(r'\s+', ' ', cell_text)
                if cell_text:
                    cells.append(cell_text)
        if cells:
            rows.append(' | '.join(cells))
    if not rows:
        return ''
    return '\n'.join(rows)


def _preprocess_tree(doc):
    """Pre-process HTML tree to add delimiters before text_content() flattens it.

    Handles <table>, <br>, <li>, <dt>, <dd> -- elements that lxml's
    text_content() would concatenate without any separators. Modifies *doc*
    in place and returns None.
    """
    from lxml import etree

    # 1. Replace <table> elements with their pipe-delimited text.
    #    Innermost tables go first (iter() yields ancestors before
    #    descendants), so a nested table is already plain text inside its
    #    cell when the outer table is flattened and its rows are not
    #    emitted twice.
    for table in reversed(list(doc.iter('table'))):
        formatted = _flatten_table(table)
        parent = table.getparent()
        if formatted:
            if parent is not None:
                replacement = etree.Element('div')
                replacement.text = '\n\n' + formatted + '\n\n'
                # replace() removes the old element together with its tail,
                # so carry over the text that followed </table>.
                replacement.tail = table.tail
                parent.replace(table, replacement)
        elif parent is not None:
            # Table with no text: remove it (drop_tree keeps the tail text).
            table.drop_tree()

    # 2. <br> -> inject newline
    for br in list(doc.iter('br')):
        br.tail = '\n' + (br.tail or '')

    # 3. <li> -> inject "- " prefix, and a newline after the item
    for li in list(doc.iter('li')):
        li.text = '- ' + (li.text or '')
        li.tail = '\n' + (li.tail or '')

    # 4. <dt> -> inject newline after the term
    for dt in list(doc.iter('dt')):
        dt.tail = '\n' + (dt.tail or '')

    # 5. <dd> -> inject indent, and a newline after the definition
    for dd in list(doc.iter('dd')):
        dd.text = '  ' + (dd.text or '')
        dd.tail = '\n' + (dd.tail or '')
"hey aurora", + "good morning", + "thanks", + "thank you", + "what's your name", + "who are you", + "tell me a joke", + "how are you", + "hi there", + ], + "rag_search": [ + "what does the survival manual say about water", + "how to purify water in the field", + "how to treat a gunshot wound", + "what is the ranger handbook chapter on patrolling", + "field manual water purification", + "how to build a shelter in the wilderness", + "tactical combat casualty care procedures", + "what does FM 21-76 say about fire starting", + ], +} + +# ── Module-level cache ──────────────────────────────────────────────────────── +_ROUTE_CENTROIDS: dict | None = None +_LOCK = threading.Lock() + + +def _embed_batch(texts: list[str], tei_url: str) -> list[list[float]]: + """Embed a batch of texts via TEI.""" + resp = requests.post(tei_url, json={"inputs": texts}, timeout=30) + resp.raise_for_status() + return resp.json() + + +def _compute_centroid(vectors: list[list[float]]) -> list[float]: + """Element-wise mean of vectors.""" + n = len(vectors) + dim = len(vectors[0]) + centroid = [0.0] * dim + for vec in vectors: + for i in range(dim): + centroid[i] += vec[i] + for i in range(dim): + centroid[i] /= n + return centroid + + +def _cosine_similarity(a: list[float], b: list[float]) -> float: + """Cosine similarity between two vectors (pure Python).""" + dot = 0.0 + norm_a = 0.0 + norm_b = 0.0 + for i in range(len(a)): + dot += a[i] * b[i] + norm_a += a[i] * a[i] + norm_b += b[i] * b[i] + denom = math.sqrt(norm_a) * math.sqrt(norm_b) + if denom == 0: + return 0.0 + return dot / denom + + +def _ensure_centroids(tei_url: str) -> dict[str, list[float]]: + """Lazy-init: embed all examples in one batch, compute centroids, cache.""" + global _ROUTE_CENTROIDS + if _ROUTE_CENTROIDS is not None: + return _ROUTE_CENTROIDS + + with _LOCK: + if _ROUTE_CENTROIDS is not None: + return _ROUTE_CENTROIDS + + # Flatten all examples into one batch + all_texts = [] + route_ranges: dict[str, tuple[int, int]] 
def classify(
    query: str,
    tei_url: str = "http://100.64.0.14:8090/embed",
    threshold: float = 0.45,
) -> tuple[str, float]:
    """Pick the route whose centroid is most similar to *query*.

    Returns (route_name, confidence), where confidence is the best cosine
    similarity found (never below 0.0). When that score is under *threshold*
    the route is forced to "rag_search", the safe default.
    """
    route_centroids = _ensure_centroids(tei_url)

    # Embed the query on its own; the centroids are already cached.
    query_vec = _embed_batch([query], tei_url)[0]

    # Keep the highest-scoring route seen so far.
    best_route, best_score = "rag_search", 0.0
    for name, centroid in route_centroids.items():
        score = _cosine_similarity(query_vec, centroid)
        if score > best_score:
            best_route, best_score = name, score

    if best_score < threshold:
        return ("rag_search", best_score)
    return (best_route, best_score)
print("Query Router Test Suite") + print("=" * 70) + + passed = 0 + failed = 0 + + for query, expected in TEST_QUERIES: + route, confidence = classify(query) + status = "PASS" if route == expected else "FAIL" + if status == "PASS": + passed += 1 + else: + failed += 1 + print(f" [{status}] {query!r}") + print(f" → {route} ({confidence:.3f}) expected={expected}") + + print("=" * 70) + print(f"Results: {passed}/{passed + failed} passed") + if failed: + print(f" {failed} FAILED") + sys.exit(1) + else: + print(" All tests passed!") + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index f643cd8..1da21bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ anyio==4.12.1 babel==2.18.0 beautifulsoup4==4.14.3 blinker==1.9.0 +cachetools==7.1.3 certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4