diff --git a/config/address_book.yaml b/config/address_book.yaml deleted file mode 100644 index 24bc81c..0000000 --- a/config/address_book.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# RECON Address Book — saved locations for navigation shortcuts. -# Entries are matched by name and aliases (case-insensitive). -# Add new entries by appending to the list below. - -entries: - - id: home - name: Home - aliases: - - home - - matt's house - - 214 north st - - 214 north street - address: "214 North St, Filer, ID 83328" - lat: 42.5735833 - lon: -114.6066389 - tags: - - residence - - primary diff --git a/config/profiles/home.yaml b/config/profiles/home.yaml deleted file mode 100644 index de704d9..0000000 --- a/config/profiles/home.yaml +++ /dev/null @@ -1,67 +0,0 @@ -# Deployment profile: Home (VM 1130) -# Active on the main Echo6 deployment. Full stack with planet-scale NA tiles. -# Override via RECON_PROFILE env var in /etc/systemd/system/recon.service - -profile: home -region_name: "North America" - -tileset: - url: "/tiles/planet/current.pmtiles" - bounds: [-168, 14, -52, 72] - max_zoom: 15 - attribution: "Protomaps © OSM" - -tileset_hillshade: - url: "/tiles/planet-dem.pmtiles" - encoding: "terrarium" - max_zoom: 12 - -traffic: - provider: "tomtom" - proxy_url: "/api/traffic/flow/{z}/{x}/{y}.png" - -place_details: - local_source: "nominatim" - local_bbox: [-125.0, 31.3, -104.0, 49.0] - fallback_source: "overpass" - -services: - geocode: "/api/geocode" - reverse: "/api/reverse" - address_book: "/api/address_book" - valhalla: "/valhalla" - -auth: - login_url: "/outpost.goauthentik.io/start?rd=%2F" - logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/" - -features: - has_nominatim_details: true - has_kiwix_wiki: true - has_hillshade: true - has_3d_terrain: false - has_traffic_overlay: true - has_landclass: true - has_public_lands_layer: true - has_contours: true - has_contours_test: false - has_contours_test_10ft: false - has_address_book_write: false - has_overture_enrichment: true - has_google_places_enrichment: true - has_contacts: true - has_wiki_rewriting: true - has_wiki_discovery: false - has_usfs_trails: true - has_blm_trails: true - -defaults: - center: [42.5736, -114.6066] - zoom: 10 - -# Offroute wilderness routing -offroute: - osm_pbf_path: "/mnt/nav/sources/idaho-latest.osm.pbf" - densify_interval_m: 100 - postgis_dsn: "dbname=padus" - diff --git a/config/profiles/minimal_pi.yaml b/config/profiles/minimal_pi.yaml deleted file mode 100644 index c2fd90a..0000000 --- a/config/profiles/minimal_pi.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# Deployment profile: Minimal Pi (single-state pocket deployment) -# Template for the lightest possible field kit — Idaho only. -# Override via RECON_PROFILE env var. - -profile: minimal_pi -region_name: "Idaho" - -tileset: - url: "/tiles/idaho.pmtiles" - bounds: [-117.5, 42.0, -111.0, 49.0] - max_zoom: 15 - attribution: "Protomaps © OSM" - -tileset_hillshade: - url: "/tiles/hillshade-idaho.pmtiles" - encoding: "terrarium" - max_zoom: 12 - -traffic: - provider: "tomtom" - proxy_url: "/api/traffic/flow/{z}/{x}/{y}.png" - -services: - geocode: "/api/geocode" - reverse: "/api/reverse" - address_book: "/api/address_book" - valhalla: "/valhalla" - -# TODO(matt): confirm logout next= host for this profile -auth: - login_url: "/outpost.goauthentik.io/start?rd=%2F" - logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/" - -features: - has_nominatim_details: false - has_kiwix_wiki: false - has_hillshade: false - has_3d_terrain: false - has_traffic_overlay: false - has_landclass: false - has_public_lands_layer: false - has_address_book_write: true - has_overture_enrichment: false - has_google_places_enrichment: false - has_contacts: false - has_wiki_rewriting: false - has_wiki_discovery: false - -defaults: - center: [44.0, -114.0] - zoom: 7 diff --git a/config/profiles/regional_pi.yaml b/config/profiles/regional_pi.yaml deleted file mode 100644 index b6f2cad..0000000 --- a/config/profiles/regional_pi.yaml +++ /dev/null @@ -1,59 +0,0 @@ -# Deployment profile: Regional Pi (multi-state field kit) -# Template for a Raspberry Pi covering Idaho + surrounding states. -# Override via RECON_PROFILE env var. - -profile: regional_pi -region_name: "Idaho + Neighbors" - -tileset: - url: "/tiles/regional.pmtiles" - bounds: [-125, 40, -104, 49] - max_zoom: 15 - attribution: "Protomaps © OSM" - -tileset_hillshade: - url: "/tiles/hillshade-regional.pmtiles" - encoding: "terrarium" - max_zoom: 12 - -traffic: - provider: "tomtom" - proxy_url: "/api/traffic/flow/{z}/{x}/{y}.png" - -place_details: - local_source: "nominatim" - local_bbox: [-125.0, 40.0, -104.0, 49.0] - fallback_source: "overpass" - -services: - geocode: "/api/geocode" - reverse: "/api/reverse" - address_book: "/api/address_book" - valhalla: "/valhalla" - -# TODO(matt): confirm logout next= host for this profile -auth: - login_url: "/outpost.goauthentik.io/start?rd=%2F" - logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/" - -features: - has_nominatim_details: true - has_kiwix_wiki: false - has_hillshade: true - has_3d_terrain: false - has_traffic_overlay: true - has_landclass: true - has_public_lands_layer: true - has_contours: true - has_contours_test: true - has_contours_test_10ft: true - has_address_book_write: true - has_overture_enrichment: false - has_google_places_enrichment: false - has_contacts: false - has_wiki_rewriting: true - has_wiki_discovery: false - -defaults: - center: [44.0, -114.0] - zoom: 7 diff --git a/lib/api.py b/lib/api.py index a0697bf..6a3d627 100644 --- a/lib/api.py +++ b/lib/api.py @@ -57,10 +57,6 @@ class _LargeZimRequest(_FlaskRequest): return super()._get_file_stream(total_content_length, content_type, filename, content_length) app.request_class = _LargeZimRequest -# ── Netsyms Blueprint ── -from .netsyms_api import netsyms_bp -app.register_blueprint(netsyms_bp) - # ── Navigation Constants ── @@ -1319,9 +1315,6 @@ def api_keys_reload(): return jsonify({'count': count}) - - - # ── YouTube Cookie Management ── PEERTUBE_HOST = '192.168.1.170' diff --git a/lib/aurora_nav_tool.py b/lib/aurora_nav_tool.py deleted file mode 100644 index 2b7285d..0000000 --- a/lib/aurora_nav_tool.py +++ /dev/null @@ -1,117 +0,0 @@ -""" -title: Navigation -author: Echo6 -version: 1.1.0 -description: Turn-by-turn directions and geocoding via Photon + Valhalla on recon-vm. Supports driving, walking, cycling, and truck routing with worldwide coverage (281M places). -""" - -import re -import json -import requests -from pydantic import BaseModel, Field - -_COORD_RE = re.compile(r'^(-?\d+\.?\d*)\s*,\s*(-?\d+\.?\d*)$') - - -class Tools: - class Valves(BaseModel): - photon_url: str = Field( - default="http://100.64.0.24:2322", - description="Photon geocoding service URL (recon-vm)", - ) - valhalla_url: str = Field( - default="http://100.64.0.24:8002", - description="Valhalla routing service URL (recon-vm)", - ) - - def __init__(self): - self.valves = self.Valves() - - def _geocode(self, query: str): - m = _COORD_RE.match(query.strip()) - if m: - lat, lon = float(m.group(1)), float(m.group(2)) - return lat, lon, query - resp = requests.get( - f"{self.valves.photon_url}/api", - params={"q": query, "limit": 1}, - timeout=10, - ) - resp.raise_for_status() - features = resp.json().get("features", []) - if not features: - return None, None, None - props = features[0]["properties"] - coords = features[0]["geometry"]["coordinates"] - parts = [props.get("name", "")] - for key in ("city", "state", "country"): - v = props.get(key) - if v and v != parts[-1]: - parts.append(v) - return coords[1], coords[0], ", ".join(p for p in parts if p) - - def get_directions( - self, - origin: str, - destination: str, - mode: str = "auto", - ) -> str: - """ - Get turn-by-turn directions between two locations. When this tool returns results, present the directions exactly as returned — do not summarize or rephrase. Include all steps. - - :param origin: Starting location — address, place name, or lat,lon coordinates - :param destination: Destination — address, place name, or lat,lon coordinates - :param mode: Travel mode: auto, pedestrian, bicycle, or truck (default: auto) - :return: Formatted turn-by-turn directions - """ - if mode not in ("auto", "pedestrian", "bicycle", "truck"): - mode = "auto" - - orig_lat, orig_lon, orig_name = self._geocode(origin) - if orig_lat is None: - return f"Could not find location: {origin}" - - dest_lat, dest_lon, dest_name = self._geocode(destination) - if dest_lat is None: - return f"Could not find location: {destination}" - - try: - resp = requests.post( - f"{self.valves.valhalla_url}/route", - json={ - "locations": [ - {"lat": orig_lat, "lon": orig_lon}, - {"lat": dest_lat, "lon": dest_lon}, - ], - "costing": mode, - "directions_options": {"units": "miles"}, - }, - timeout=30, - ) - except requests.RequestException: - return "Navigation service unavailable" - - if resp.status_code != 200: - return "No route found between locations" - - trip = resp.json()["trip"] - summary = trip["summary"] - legs = trip["legs"][0]["maneuvers"] - - miles = round(summary["length"], 1) - minutes = round(summary["time"] / 60, 1) - - lines = [ - f"Directions from {orig_name} to {dest_name} ({mode}):", - f"Distance: {miles} miles | Time: {minutes} minutes", - "", - ] - for i, m in enumerate(legs, 1): - inst = m["instruction"] - dist = m.get("length", 0) - if dist > 0: - lines.append(f"{i}. {inst} — {round(dist, 1)} mi") - else: - lines.append(f"{i}. {inst}") - - return "\n".join(lines) diff --git a/lib/auth.py b/lib/auth.py deleted file mode 100644 index 22b08d2..0000000 --- a/lib/auth.py +++ /dev/null @@ -1,22 +0,0 @@ -""" -RECON Auth Helper — extract user identity from Authentik forward-auth headers. -""" -from functools import wraps -from flask import request, jsonify - - -def get_user_id(): - """Return X-Authentik-Username or None.""" - return request.headers.get('X-Authentik-Username') - - -def require_auth(f): - """Decorator: 401 if no Authentik auth header.""" - @wraps(f) - def wrapper(*args, **kwargs): - user_id = get_user_id() - if not user_id: - return jsonify({'error': 'Authentication required'}), 401 - request.user_id = user_id - return f(*args, **kwargs) - return wrapper diff --git a/lib/deployment_config.py b/lib/deployment_config.py deleted file mode 100644 index ab6aa17..0000000 --- a/lib/deployment_config.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Deployment profile loader. - -Reads RECON_PROFILE env var (default: "home"), loads the matching YAML -from config/profiles/.yaml, and caches the parsed dict in memory. - -Exposes get_deployment_config() as the in-process accessor for the profile. - -Note: its former consumers (the /api/landclass gate, google_places, -place_detail, offroute/router) were all extracted to navi-* services or removed -across cleanups #4–#6/#27 — recon has no remaining caller of -get_deployment_config() today; the module is retained per cleanup #1. -(The former /api/config HTTP endpoint that served this dict to the frontend was -removed once navi-config (:8422) took over that route.) -""" -import os -import yaml -from .utils import setup_logging - -logger = setup_logging('recon.deployment_config') - -_config_cache = None - - -def load_deployment_config(): - """Load and cache the deployment profile. Called once at import time.""" - global _config_cache - - profile = os.environ.get('RECON_PROFILE', 'home') - config_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'config', 'profiles') - config_path = os.path.join(config_dir, f'{profile}.yaml') - - if not os.path.exists(config_path): - raise FileNotFoundError( - f"Deployment profile '{profile}' not found at {config_path}. " - f"Available profiles: {', '.join(f.replace('.yaml','') for f in os.listdir(config_dir) if f.endswith('.yaml'))}" - ) - - with open(config_path, 'r') as f: - _config_cache = yaml.safe_load(f) - - logger.info(f"Loaded deployment profile: {profile} ({_config_cache.get('region_name', 'unknown')})") - return _config_cache - - -def get_deployment_config(): - """Return the cached deployment config dict.""" - if _config_cache is None: - load_deployment_config() - return _config_cache - - -# Load on import so startup fails fast if profile is missing -load_deployment_config() diff --git a/lib/extractor.py b/lib/extractor.py index bc236ab..13159c9 100644 --- a/lib/extractor.py +++ b/lib/extractor.py @@ -21,7 +21,6 @@ Config: processing.extract_workers, processing.max_pdf_size_mb, processing.extract_timeout, processing.page_timeout """ import base64 -import re import json import os import random @@ -100,40 +99,6 @@ def _is_transient(error_str): return any(sig in s for sig in transient_signals) -def _text_quality_ok(text, min_length=50): - """Check if extracted text meets quality thresholds. - - Beyond the basic length check, validates: - - Word-boundary ratio: at least 60% of tokens should be real words (2+ alpha chars) - - Concatenation ratio: lowercase-immediately-followed-by-uppercase shouldn't exceed 10% of word count - - Returns True if text passes all checks. - """ - text = text.strip() - if len(text) < min_length: - return False - - words = text.split() - if not words: - return False - - # Word-like ratio: tokens with 2+ alphabetic characters - word_like = sum(1 for w in words if len(re.findall(r'[a-zA-Z]', w)) >= 2) - word_ratio = word_like / len(words) - if word_ratio < 0.60: - return False - - # Concatenation detector: lowercase immediately followed by uppercase - # Filter out common camelCase patterns in code (short tokens) - concat_hits = len(re.findall(r'[a-z][A-Z]', text)) - concat_ratio = concat_hits / len(words) if words else 0 - if concat_ratio > 0.10: - return False - - return True - - - def _render_page_to_png(pdf_path, page_num_1indexed, dpi=200, timeout=30): """Render a single PDF page to PNG bytes using pdftoppm. @@ -259,7 +224,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30): # Method 1: pdftotext (poppler) try: result = subprocess.run( - ['pdftotext', '-layout', '-f', str(page_num_0indexed + 1), + ['pdftotext', '-f', str(page_num_0indexed + 1), '-l', str(page_num_0indexed + 1), pdf_path, '-'], capture_output=True, text=True, timeout=page_timeout ) @@ -268,7 +233,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30): except Exception: pass - if _text_quality_ok(text): + if len(text.strip()) >= 50: return text, 'pdftotext' # Method 2: pdftoppm + Tesseract OCR @@ -293,7 +258,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30): except Exception: pass - if _text_quality_ok(text): + if len(text.strip()) >= 50: return text, 'tesseract' # Method 3: Gemini Vision (last resort) @@ -311,26 +276,8 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30): # ── Core extraction functions ── def _pypdf2_extract(reader, page_num): - """Extract text from a PyPDF2 page object. Runs inside a thread for timeout. - - Tries default extraction first (space_width=200). If quality check fails, - retries with space_width=100 which better detects word boundaries in - tightly-kerned PDFs (common in Haynes/workshop manuals). - - Note: PyPDF2 3.0.1 does not support layout=True. The space_width parameter - controls word-boundary detection tolerance. Lower values = more aggressive - space insertion between characters. - """ - text = reader.pages[page_num].extract_text() or '' - if _text_quality_ok(text): - return text - - # Retry with tighter word-boundary detection - text_tight = reader.pages[page_num].extract_text(space_width=100.0) or '' - if len(text_tight.strip()) >= len(text.strip()): - return text_tight - - return text + """Extract text from a PyPDF2 page object. Runs inside a thread for timeout.""" + return reader.pages[page_num].extract_text() or '' def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30): @@ -355,13 +302,13 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30): except Exception: text = '' - if _text_quality_ok(text): + if len(text.strip()) >= 50: return text, 'pypdf2' # Method 2: pdftotext via subprocess (inherently timeout-safe) try: result = subprocess.run( - ['pdftotext', '-layout', '-f', str(page_num + 1), '-l', str(page_num + 1), pdf_path, '-'], + ['pdftotext', '-f', str(page_num + 1), '-l', str(page_num + 1), pdf_path, '-'], capture_output=True, text=True, timeout=page_timeout ) if result.returncode == 0 and len(result.stdout.strip()) > len(text.strip()): @@ -369,7 +316,7 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30): except Exception: pass - if _text_quality_ok(text): + if len(text.strip()) >= 50: return text, 'pdftotext' # Method 3: pdftoppm + Tesseract OCR @@ -393,7 +340,7 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30): except Exception: pass - if _text_quality_ok(text): + if len(text.strip()) >= 50: return text, 'tesseract' # Method 4: Gemini Vision (last resort — costs API calls but handles scanned docs) diff --git a/lib/netsyms.py b/lib/netsyms.py deleted file mode 100644 index d51162e..0000000 --- a/lib/netsyms.py +++ /dev/null @@ -1,228 +0,0 @@ -""" -RECON Netsyms AddressDatabase2025 — SQLite-backed US+CA address lookup. - -Provides 159.78M geocoded addresses as tier-2 between address book -(exact named locations) and Photon (full-text global geocoding). - -Database: /mnt/nav/addresses/AddressDatabase2025.sqlite (read-only) -""" - -import os -import re -import sqlite3 -import threading - -from .utils import setup_logging - -logger = setup_logging('recon.netsyms') - -_DB_PATH = '/mnt/nav/addresses/AddressDatabase2025.sqlite' - -_conn = None -_lock = threading.Lock() -_cached_row_count = None - -# US states + DC + territories, CA provinces, for free-text parsing -_STATE_CODES = { - 'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', - 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', - 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', - 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', - 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', - 'DC', 'PR', 'VI', 'GU', 'AS', 'MP', - # Canadian provinces - 'AB', 'BC', 'MB', 'NB', 'NL', 'NS', 'NT', 'NU', 'ON', 'PE', - 'QC', 'SK', 'YT', -} - -_NUMBER_RE = re.compile(r'^(\d+[\w-]*)(.*)$') - - -def _get_conn(): - """Lazy-open a read-only SQLite connection.""" - global _conn - if _conn is not None: - return _conn - with _lock: - if _conn is not None: - return _conn - uri = f'file:{_DB_PATH}?mode=ro' - _conn = sqlite3.connect(uri, uri=True, check_same_thread=False) - _conn.row_factory = sqlite3.Row - logger.info("Netsyms DB opened: %s", _DB_PATH) - return _conn - - -def _row_to_dict(row): - """Convert a sqlite3.Row to a plain dict with lat/lon keys.""" - return { - 'zipcode': row['zipcode'], - 'number': row['number'], - 'street': row['street'], - 'street2': row['street2'], - 'city': row['city'], - 'state': row['state'], - 'plus4': row['plus4'], - 'country': row['country'], - 'lat': float(row['latitude']), - 'lon': float(row['longitude']), - 'source': row['source'], - } - - -def lookup_by_street(number, street, city=None, state=None, - zipcode=None, country=None, limit=20): - """Match on number + street, with optional qualifiers.""" - conn = _get_conn() - clauses = ['number = ?', 'street = ?'] - params = [str(number).strip().upper(), street.strip().upper()] - - if city: - clauses.append('city = ?') - params.append(city.strip().upper()) - if state: - clauses.append('state = ?') - params.append(state.strip().upper()) - if zipcode: - clauses.append('zipcode = ?') - params.append(zipcode.strip()) - if country: - clauses.append('country = ?') - params.append(country.strip().upper()) - - sql = f"SELECT * FROM addresses WHERE {' AND '.join(clauses)} LIMIT ?" - params.append(limit) - - with _lock: - try: - rows = conn.execute(sql, params).fetchall() - except sqlite3.Error as e: - logger.warning("Netsyms lookup_by_street error: %s", e) - return [] - - results = [_row_to_dict(r) for r in rows] - logger.debug("lookup_by_street(%s, %s, city=%s, state=%s) → %d results", - number, street, city, state, len(results)) - return results - - -def lookup_free_text(query, country_hint=None): - """Parse a free-text address and look it up.""" - q = query.strip() - if not q: - return [] - - # Strip trailing zipcode if present - zipcode = None - zip_match = re.search(r'\b(\d{5})\s*$', q) - if zip_match: - zipcode = zip_match.group(1) - q = q[:zip_match.start()].strip().rstrip(',').strip() - - # Strip trailing state - tokens = re.split(r'[,\s]+', q) - tokens = [t for t in tokens if t] - if not tokens: - return [] - - state = None - if len(tokens) >= 2 and tokens[-1].upper() in _STATE_CODES: - state = tokens[-1].upper() - tokens = tokens[:-1] - - # Leading digits → number - number = None - if tokens and re.match(r'^\d', tokens[0]): - number = tokens[0] - tokens = tokens[1:] - - if not tokens: - # Only a number, or empty — try zipcode if we have one - if zipcode: - return lookup_by_zipcode(zipcode, limit=20) - return [] - - # If state was found and we have 2+ tokens remaining, last token is city - city = None - if state and len(tokens) >= 2: - city = tokens[-1] - tokens = tokens[:-1] - - street = ' '.join(tokens) - - if number: - results = lookup_by_street(number, street, city=city, state=state, - zipcode=zipcode, country=country_hint) - if results: - logger.debug("lookup_free_text(%r) → %d results via street match", - query, len(results)) - return results - - # Fallback: try zipcode only if available - if zipcode: - return lookup_by_zipcode(zipcode, limit=20) - - logger.debug("lookup_free_text(%r) → 0 results", query) - return [] - - -def lookup_by_zipcode(zipcode, limit=100): - """Direct zipcode lookup.""" - conn = _get_conn() - sql = "SELECT * FROM addresses WHERE zipcode = ? LIMIT ?" - params = [zipcode.strip(), limit] - - with _lock: - try: - rows = conn.execute(sql, params).fetchall() - except sqlite3.Error as e: - logger.warning("Netsyms lookup_by_zipcode error: %s", e) - return [] - - results = [_row_to_dict(r) for r in rows] - logger.debug("lookup_by_zipcode(%s) → %d results", zipcode, len(results)) - return results - - -def health(): - """Health check with cached row count.""" - global _cached_row_count - - try: - file_size = os.path.getsize(_DB_PATH) - except OSError: - return {'ok': False, 'row_count': 0, 'file_size_bytes': 0, - 'indexed_countries': []} - - try: - conn = _get_conn() - except Exception: - return {'ok': False, 'row_count': 0, 'file_size_bytes': file_size, - 'indexed_countries': []} - - if _cached_row_count is None: - with _lock: - if _cached_row_count is None: - try: - row = conn.execute( - "SELECT COUNT(*) AS cnt FROM addresses" - ).fetchone() - _cached_row_count = row['cnt'] - except sqlite3.Error: - _cached_row_count = 0 - - with _lock: - try: - rows = conn.execute( - "SELECT DISTINCT country FROM addresses" - ).fetchall() - countries = sorted(r['country'] for r in rows) - except sqlite3.Error: - countries = [] - - return { - 'ok': True, - 'row_count': _cached_row_count, - 'file_size_bytes': file_size, - 'indexed_countries': countries, - } diff --git a/lib/netsyms_api.py b/lib/netsyms_api.py deleted file mode 100644 index dbae24e..0000000 --- a/lib/netsyms_api.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -RECON Netsyms API — Flask Blueprint. - -GET /api/netsyms/lookup?q=&country= -GET /api/netsyms/health -""" - -from flask import Blueprint, request, jsonify - -from . import netsyms -from .utils import setup_logging - -logger = setup_logging('recon.netsyms_api') - -netsyms_bp = Blueprint('netsyms', __name__) - - -@netsyms_bp.route('/api/netsyms/lookup') -def api_netsyms_lookup(): - q = request.args.get('q', '').strip() - if not q: - return jsonify({'error': 'Missing q parameter'}), 400 - - country = request.args.get('country', '').strip() or None - results = netsyms.lookup_free_text(q, country_hint=country) - return jsonify({'results': results, 'count': len(results), 'query': q}) - - -@netsyms_bp.route('/api/netsyms/health') -def api_netsyms_health(): - return jsonify(netsyms.health()) diff --git a/lib/netsyms_test.py b/lib/netsyms_test.py deleted file mode 100644 index ed70472..0000000 --- a/lib/netsyms_test.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python3 -"""Tests for Netsyms address database module.""" - -import sys -import os - -# Ensure the lib directory is importable -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from lib import netsyms - - -def test_lookup_by_street_lowercase(): - results = netsyms.lookup_by_street("214", "North St", city="Filer", state="ID") - assert len(results) >= 1, f"Expected at least 1 result, got {len(results)}" - r = results[0] - assert abs(r['lat'] - 42.5736) < 0.01, f"Lat mismatch: {r['lat']}" - assert abs(r['lon'] - (-114.6066)) < 0.01, f"Lon mismatch: {r['lon']}" - print(" PASS: lookup_by_street (lowercase)") - - -def test_lookup_by_street_uppercase(): - results = netsyms.lookup_by_street("214", "NORTH ST", city="FILER", state="ID") - assert len(results) >= 1, f"Expected at least 1 result, got {len(results)}" - r = results[0] - assert abs(r['lat'] - 42.5736) < 0.01, f"Lat mismatch: {r['lat']}" - print(" PASS: lookup_by_street (uppercase)") - - -def test_lookup_nonexistent(): - results = netsyms.lookup_by_street("999999", "Nonexistent Rd", - city="Filer", state="ID") - assert results == [], f"Expected empty list, got {len(results)} results" - print(" PASS: lookup_by_street (nonexistent)") - - -def test_free_text_with_commas(): - results = netsyms.lookup_free_text("214 North St, Filer, ID") - assert len(results) >= 1, f"Expected at least 1 result, got {len(results)}" - r = results[0] - assert r['city'] == 'FILER', f"City mismatch: {r['city']}" - assert r['state'] == 'ID', f"State mismatch: {r['state']}" - print(" PASS: lookup_free_text (commas)") - - -def test_free_text_no_commas(): - results = netsyms.lookup_free_text("214 North St Filer ID") - assert len(results) >= 1, f"Expected at least 1 result, got {len(results)}" - r = results[0] - assert r['state'] == 'ID', f"State mismatch: {r['state']}" - print(" PASS: lookup_free_text (no commas)") - - -def test_lookup_by_zipcode(): - results = netsyms.lookup_by_zipcode("83328", limit=5) - assert len(results) == 5, f"Expected 5 results, got {len(results)}" - for r in results: - assert r['zipcode'] == '83328', f"Zipcode mismatch: {r['zipcode']}" - print(" PASS: lookup_by_zipcode") - - -def test_health(): - h = netsyms.health() - assert h['ok'] is True, f"Health not OK: {h}" - assert h['row_count'] >= 159_000_000, f"Row count too low: {h['row_count']}" - assert 'US' in h['indexed_countries'], f"US not in countries: {h['indexed_countries']}" - assert 'CA' in h['indexed_countries'], f"CA not in countries: {h['indexed_countries']}" - print(" PASS: health") - - -if __name__ == '__main__': - print("Running Netsyms tests...") - test_lookup_by_street_lowercase() - test_lookup_by_street_uppercase() - test_lookup_nonexistent() - test_free_text_with_commas() - test_free_text_no_commas() - test_lookup_by_zipcode() - test_health() - print("All tests passed.") diff --git a/lib/processors/zim_processor.py b/lib/processors/zim_processor.py index 6f5c887..b258408 100644 --- a/lib/processors/zim_processor.py +++ b/lib/processors/zim_processor.py @@ -77,73 +77,10 @@ def _text_hash(text): return hashlib.md5(text.encode('utf-8')).hexdigest() -def _flatten_table(table_el): - """Convert a element to pipe-delimited text. - - Each becomes a row with cells joined by ' | '. - Returns the formatted table as a string with blank lines around it. - """ - rows = [] - for tr in table_el.iter('tr'): - cells = [] - for cell in tr: - if cell.tag in ('td', 'th'): - cell_text = (cell.text_content() or '').strip() - # Collapse internal whitespace in each cell - cell_text = re.sub(r'\s+', ' ', cell_text) - if cell_text: - cells.append(cell_text) - if cells: - rows.append(' | '.join(cells)) - if not rows: - return '' - return '\n'.join(rows) - - -def _preprocess_tree(doc): - """Pre-process HTML tree to add delimiters before text_content() flattens it. - - Handles:
,
,
  • ,
    ,
    -- elements that lxml's - text_content() would concatenate without any separators. - """ - from lxml import etree - - # 1. Replace
  • elements with their pipe-delimited text - for table in list(doc.iter('table')): - formatted = _flatten_table(table) - if formatted: - replacement = etree.Element('div') - replacement.text = '\n\n' + formatted + '\n\n' - parent = table.getparent() - if parent is not None: - parent.replace(table, replacement) - else: - table.drop_tree() - - # 2.
    -> inject newline - for br in list(doc.iter('br')): - br.tail = '\n' + (br.tail or '') - - # 3.
  • -> inject newline + "- " prefix - for li in list(doc.iter('li')): - li.text = '- ' + (li.text or '') - li.tail = '\n' + (li.tail or '') - - # 4.
    -> inject newline before - for dt in list(doc.iter('dt')): - dt.tail = '\n' + (dt.tail or '') - - # 5.
    -> inject newline + indent - for dd in list(doc.iter('dd')): - dd.text = ' ' + (dd.text or '') - dd.tail = '\n' + (dd.tail or '') - - def _html_to_text(html_bytes): """Convert HTML bytes to clean text via lxml. Strips nav, footer, script, style elements. Decodes entities. - Pre-processes tables, lists, and line breaks for proper delimiters. Normalizes whitespace. """ try: @@ -156,9 +93,6 @@ def _html_to_text(html_bytes): for el in doc.iter(tag): el.drop_tree() - # Pre-process tree: tables -> pipe-delimited, br -> newlines, li -> dashes - _preprocess_tree(doc) - # Extract text text = doc.text_content() diff --git a/lib/query_router.py b/lib/query_router.py deleted file mode 100644 index dda14a2..0000000 --- a/lib/query_router.py +++ /dev/null @@ -1,161 +0,0 @@ -"""Semantic query router for Aurora. - -Classifies user queries into routes (nav_route, nav_reverse_geocode, -direct_answer, rag_search) by comparing query embeddings against -pre-computed route centroids from example queries. - -TEI endpoint: http://100.64.0.14:8090/embed (cortex via Tailscale) -""" - -import math -import threading -import requests - -# ── Route examples ──────────────────────────────────────────────────────────── -ROUTE_EXAMPLES = { - "nav_route": [ - "how do I get to Boise", - "directions to Twin Falls", - "how do I get from Buhl to Boise", - "drive from Jerome to Sun Valley", - "route from Boise to McCall", - "what's the fastest way to Sun Valley", - "how far is it to Twin Falls", - "take me to Shoshone", - "navigate to the airport", - "how do I drive to Salt Lake City", - "walking directions to the park", - "bike route to downtown", - ], - "nav_reverse_geocode": [ - "what town is at 42.5, -114.7", - "where am I right now", - "what is at coordinates 43.6, -116.2", - "what location is 42.574, -114.607", - "where is this place 44.0, -114.3", - "what city is near 42.7, -114.5", - "reverse geocode 43.0, -115.0", - "what's at this location 42.9, -114.8", - ], - "direct_answer": [ - "hello", - "hey aurora", - "good morning", - "thanks", - "thank you", - "what's your name", - "who are you", - "tell me a joke", - "how are you", - "hi there", - ], - "rag_search": [ - "what does the survival manual say about water", - "how to purify water in the field", - "how to treat a gunshot wound", - "what is the ranger handbook chapter on patrolling", - "field manual water purification", - "how to build a shelter in the wilderness", - "tactical combat casualty care procedures", - "what does FM 21-76 say about fire starting", - ], -} - -# ── Module-level cache ──────────────────────────────────────────────────────── -_ROUTE_CENTROIDS: dict | None = None -_LOCK = threading.Lock() - - -def _embed_batch(texts: list[str], tei_url: str) -> list[list[float]]: - """Embed a batch of texts via TEI.""" - resp = requests.post(tei_url, json={"inputs": texts}, timeout=30) - resp.raise_for_status() - return resp.json() - - -def _compute_centroid(vectors: list[list[float]]) -> list[float]: - """Element-wise mean of vectors.""" - n = len(vectors) - dim = len(vectors[0]) - centroid = [0.0] * dim - for vec in vectors: - for i in range(dim): - centroid[i] += vec[i] - for i in range(dim): - centroid[i] /= n - return centroid - - -def _cosine_similarity(a: list[float], b: list[float]) -> float: - """Cosine similarity between two vectors (pure Python).""" - dot = 0.0 - norm_a = 0.0 - norm_b = 0.0 - for i in range(len(a)): - dot += a[i] * b[i] - norm_a += a[i] * a[i] - norm_b += b[i] * b[i] - denom = math.sqrt(norm_a) * math.sqrt(norm_b) - if denom == 0: - return 0.0 - return dot / denom - - -def _ensure_centroids(tei_url: str) -> dict[str, list[float]]: - """Lazy-init: embed all examples in one batch, compute centroids, cache.""" - global _ROUTE_CENTROIDS - if _ROUTE_CENTROIDS is not None: - return _ROUTE_CENTROIDS - - with _LOCK: - if _ROUTE_CENTROIDS is not None: - return _ROUTE_CENTROIDS - - # Flatten all examples into one batch - all_texts = [] - route_ranges: dict[str, tuple[int, int]] = {} - offset = 0 - for route, examples in ROUTE_EXAMPLES.items(): - route_ranges[route] = (offset, offset + len(examples)) - all_texts.extend(examples) - offset += len(examples) - - all_vectors = _embed_batch(all_texts, tei_url) - - centroids = {} - for route, (start, end) in route_ranges.items(): - centroids[route] = _compute_centroid(all_vectors[start:end]) - - _ROUTE_CENTROIDS = centroids - return _ROUTE_CENTROIDS - - -def classify( - query: str, - tei_url: str = "http://100.64.0.14:8090/embed", - threshold: float = 0.45, -) -> tuple[str, float]: - """Classify a query into a route. - - Returns (route_name, confidence). If no route exceeds the threshold, - returns ("rag_search", best_score) as the safe default. - """ - centroids = _ensure_centroids(tei_url) - - # Embed the query - vecs = _embed_batch([query], tei_url) - query_vec = vecs[0] - - # Compare against all centroids - best_route = "rag_search" - best_score = 0.0 - for route, centroid in centroids.items(): - sim = _cosine_similarity(query_vec, centroid) - if sim > best_score: - best_score = sim - best_route = route - - if best_score < threshold: - return ("rag_search", best_score) - - return (best_route, best_score) diff --git a/lib/query_router_test.py b/lib/query_router_test.py deleted file mode 100644 index 27ccefd..0000000 --- a/lib/query_router_test.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -"""Test suite for the semantic query router.""" - -import sys -import os - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from lib.query_router import classify - -TEST_QUERIES = [ - ("how do I get from Buhl to Boise", "nav_route"), - ("what does the survival manual say about water", "rag_search"), - ("what town is at 42.5, -114.7", "nav_reverse_geocode"), - ("hey aurora", "direct_answer"), - ("what's the fastest way to Sun Valley", "nav_route"), - ("how to purify water in the field", "rag_search"), - ("good morning", "direct_answer"), -] - - -def main(): - print("Query Router Test Suite") - print("=" * 70) - - passed = 0 - failed = 0 - - for query, expected in TEST_QUERIES: - route, confidence = classify(query) - status = "PASS" if route == expected else "FAIL" - if status == "PASS": - passed += 1 - else: - failed += 1 - print(f" [{status}] {query!r}") - print(f" → {route} ({confidence:.3f}) expected={expected}") - - print("=" * 70) - print(f"Results: {passed}/{passed + failed} passed") - if failed: - print(f" {failed} FAILED") - sys.exit(1) - else: - print(" All tests passed!") - - -if __name__ == "__main__": - main() diff --git a/requirements.txt b/requirements.txt index 1da21bc..f643cd8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,6 @@ anyio==4.12.1 babel==2.18.0 beautifulsoup4==4.14.3 blinker==1.9.0 -cachetools==7.1.3 certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4