diff --git a/config/profiles/home.yaml b/config/profiles/home.yaml index 5269812..de704d9 100644 --- a/config/profiles/home.yaml +++ b/config/profiles/home.yaml @@ -6,13 +6,13 @@ profile: home region_name: "North America" tileset: - url: "/tiles/na.pmtiles" + url: "/tiles/planet/current.pmtiles" bounds: [-168, 14, -52, 72] max_zoom: 15 attribution: "Protomaps © OSM" tileset_hillshade: - url: "/tiles/hillshade-na.pmtiles" + url: "/tiles/planet-dem.pmtiles" encoding: "terrarium" max_zoom: 12 @@ -31,16 +31,20 @@ services: address_book: "/api/address_book" valhalla: "/valhalla" +auth: + login_url: "/outpost.goauthentik.io/start?rd=%2F" + logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/" + features: has_nominatim_details: true - has_kiwix_wiki: false + has_kiwix_wiki: true has_hillshade: true has_3d_terrain: false has_traffic_overlay: true has_landclass: true has_public_lands_layer: true has_contours: true - has_contours_test: true + has_contours_test: false has_contours_test_10ft: false has_address_book_write: false has_overture_enrichment: true @@ -48,7 +52,16 @@ features: has_contacts: true has_wiki_rewriting: true has_wiki_discovery: false + has_usfs_trails: true + has_blm_trails: true defaults: center: [42.5736, -114.6066] zoom: 10 + +# Offroute wilderness routing +offroute: + osm_pbf_path: "/mnt/nav/sources/idaho-latest.osm.pbf" + densify_interval_m: 100 + postgis_dsn: "dbname=padus" + diff --git a/config/profiles/minimal_pi.yaml b/config/profiles/minimal_pi.yaml index e3ae0fd..c2fd90a 100644 --- a/config/profiles/minimal_pi.yaml +++ b/config/profiles/minimal_pi.yaml @@ -26,6 +26,11 @@ services: address_book: "/api/address_book" valhalla: "/valhalla" +# TODO(matt): confirm logout next= host for this profile +auth: + login_url: "/outpost.goauthentik.io/start?rd=%2F" + logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/" + features: has_nominatim_details: false has_kiwix_wiki: false diff --git a/config/profiles/regional_pi.yaml b/config/profiles/regional_pi.yaml index 8e70cd6..b6f2cad 100644 --- a/config/profiles/regional_pi.yaml +++ b/config/profiles/regional_pi.yaml @@ -31,6 +31,11 @@ services: address_book: "/api/address_book" valhalla: "/valhalla" +# TODO(matt): confirm logout next= host for this profile +auth: + login_url: "/outpost.goauthentik.io/start?rd=%2F" + logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/" + features: has_nominatim_details: true has_kiwix_wiki: false diff --git a/lib/address_book.py b/lib/address_book.py deleted file mode 100644 index f9827f6..0000000 --- a/lib/address_book.py +++ /dev/null @@ -1,160 +0,0 @@ -""" -RECON Address Book — YAML-backed saved-location lookup. - -Provides named locations (home, work, etc.) that short-circuit Photon -geocoding when an exact alias match is found. - -Config: /opt/recon/config/address_book.yaml -""" - -import os -import re -import threading - -import yaml - -from .utils import setup_logging - -logger = setup_logging('recon.address_book') - -_CONFIG_PATH = os.path.join( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))), - 'config', 'address_book.yaml', -) - -_lock = threading.Lock() -_entries: list[dict] = [] -_mtime: float = 0.0 - - -def _reload_if_changed(): - """Reload the YAML file if its mtime has changed.""" - global _entries, _mtime - try: - st = os.stat(_CONFIG_PATH) - except FileNotFoundError: - logger.warning("Address book not found: %s", _CONFIG_PATH) - _entries = [] - _mtime = 0.0 - return - - if st.st_mtime == _mtime: - return - - with _lock: - # Double-check after acquiring lock - try: - st = os.stat(_CONFIG_PATH) - except FileNotFoundError: - _entries = [] - _mtime = 0.0 - return - if st.st_mtime == _mtime: - return - - with open(_CONFIG_PATH, 'r') as f: - data = yaml.safe_load(f) or {} - - raw = data.get('entries', []) - loaded = [] - for entry in raw: - # Normalise aliases to lowercase for matching - aliases = [a.lower() for a in entry.get('aliases', [])] - loaded.append({ - 'id': entry.get('id', ''), - 'name': entry.get('name', ''), - 'aliases': aliases, - 'address': entry.get('address', ''), - 'lat': entry.get('lat'), - 'lon': entry.get('lon'), - 'tags': entry.get('tags', []), - }) - _entries = loaded - _mtime = st.st_mtime - logger.info("Address book loaded: %d entries from %s", len(_entries), _CONFIG_PATH) - - -def load(): - """Ensure the address book is loaded (and refreshed if the file changed).""" - _reload_if_changed() - return _entries - - -def _normalize(text: str) -> str: - """Lowercase, strip, remove commas, collapse whitespace.""" - t = text.strip().lower() - t = t.replace(',', ' ') - return ' '.join(t.split()) - - -def lookup(query: str): - """ - Look up a query against name and aliases. - - Returns dict with the matching entry plus a 'confidence' field: - - "exact": full name/alias match, OR query starts with alias + word boundary - - "partial": alias starts with query + word boundary, or alias appears - as a contiguous token sequence inside the query - - None if no match - - Matching order (first exact wins, else first partial): - 1. normalized(query) == normalized(name or alias) → exact - 2. normalized(query) starts with normalized(alias) + " " → exact - 3. normalized(alias) starts with normalized(query) + " " → partial - 4. normalized(alias) is a contiguous token sub-sequence → partial - """ - _reload_if_changed() - q = _normalize(query) - if not q: - return None - - first_exact = None - first_partial = None - - for entry in _entries: - norm_name = _normalize(entry['name']) - check_aliases = [_normalize(a) for a in entry.get('aliases', [])] - all_forms = [norm_name] + check_aliases - - for form in all_forms: - if not form: - continue - - # Rule 1: exact match - if q == form: - return {**entry, 'confidence': 'exact'} - - # Rule 2: query starts with alias + word boundary - if q.startswith(form + ' '): - if first_exact is None: - first_exact = entry - continue - - # Rule 3: alias starts with query (user still typing) - if form.startswith(q) and len(q) < len(form): - if first_partial is None: - first_partial = entry - continue - - # Rule 4: alias is contiguous token sub-sequence in query - # Build regex: token1\s+token2\s+...tokenN - tokens = form.split() - if len(tokens) >= 1: - pattern = r'(?:^|\s)' + r'\s+'.join(re.escape(t) for t in tokens) + r'(?:\s|$)' - if re.search(pattern, q): - if first_partial is None: - first_partial = entry - - if first_exact is not None: - return {**first_exact, 'confidence': 'exact'} - - if first_partial is not None: - return {**first_partial, 'confidence': 'partial'} - - return None - - -def list_all(): - """Return all address book entries.""" - _reload_if_changed() - return list(_entries) diff --git a/lib/address_book_api.py b/lib/address_book_api.py deleted file mode 100644 index 020828b..0000000 --- a/lib/address_book_api.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -RECON Address Book API — Flask Blueprint. - -GET /api/address_book/lookup?q= — best match or 404 -GET /api/address_book/list — all entries -""" - -from flask import Blueprint, request, jsonify - -from . import address_book - -address_book_bp = Blueprint('address_book', __name__) - - -@address_book_bp.route('/api/address_book/lookup') -def api_address_book_lookup(): - q = request.args.get('q', '').strip() - if not q: - return jsonify({'error': 'Missing q parameter'}), 400 - - result = address_book.lookup(q) - if result is None: - return '', 404 - - return jsonify(result) - - -@address_book_bp.route('/api/address_book/list') -def api_address_book_list(): - entries = address_book.list_all() - return jsonify(entries) diff --git a/lib/address_book_test.py b/lib/address_book_test.py deleted file mode 100644 index 75905f0..0000000 --- a/lib/address_book_test.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python3 -"""Tests for RECON address book module.""" -import sys -import os - -# Add project root to path -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from lib import address_book - -TESTS = [ - # ── Existing tests ── - ("lookup('home') → exact", - lambda: address_book.lookup("home"), - lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), - - ("lookup('Home') → exact (case-insensitive)", - lambda: address_book.lookup("Home"), - lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), - - ("lookup('214 north st') → exact via alias", - lambda: address_book.lookup("214 north st"), - lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), - - ("lookup('214 North Street') → exact via alias", - lambda: address_book.lookup("214 North Street"), - lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), - - ("lookup('nonexistent place') → None", - lambda: address_book.lookup("nonexistent place"), - lambda r: r is None), - - ("list_all() → 1 entry", - lambda: address_book.list_all(), - lambda r: isinstance(r, list) and len(r) == 1 and r[0]['id'] == 'home'), - - # ── New prefix+boundary tests ── - ("lookup('214 north st filer') → exact (query starts with alias)", - lambda: address_book.lookup("214 north st filer"), - lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), - - ("lookup('214 North St Filer ID') → exact (case + trailing state)", - lambda: address_book.lookup("214 North St Filer ID"), - lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), - - ("lookup('214 north st, filer, id') → exact (commas stripped)", - lambda: address_book.lookup("214 north st, filer, id"), - lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), - - ("lookup('home today') → exact (short alias + trailing text)", - lambda: address_book.lookup("home today"), - lambda r: r is not None and r['confidence'] == 'exact' and r['id'] == 'home'), - - ("lookup('214') → partial (query is prefix of alias)", - lambda: address_book.lookup("214"), - lambda r: r is not None and r['confidence'] == 'partial'), - - ("lookup('214 n') → partial (partial prefix of alias)", - lambda: address_book.lookup("214 n"), - lambda r: r is not None and r['confidence'] == 'partial'), - - ("lookup('completely unrelated query') → None", - lambda: address_book.lookup("completely unrelated query"), - lambda r: r is None), - - ("lookup('214 north streets of filer') → None (no word boundary after st)", - lambda: address_book.lookup("214 north streets of filer"), - lambda r: r is None), -] - -passed = 0 -failed = 0 -for name, fn, check in TESTS: - try: - result = fn() - ok = check(result) - except Exception as e: - ok = False - result = f"EXCEPTION: {e}" - - status = "PASS" if ok else "FAIL" - if ok: - passed += 1 - else: - failed += 1 - print(f" [{status}] {name}") - if not ok: - print(f" got: {result}") - -print(f"\n{passed} passed, {failed} failed") -sys.exit(0 if failed == 0 else 1) diff --git a/lib/api.py b/lib/api.py index 8a1f383..a0697bf 100644 --- a/lib/api.py +++ b/lib/api.py @@ -17,16 +17,13 @@ import shutil import tempfile import requests as http_requests -from flask import Flask, request, jsonify, redirect, render_template, make_response +from flask import Flask, request, jsonify, redirect, render_template from qdrant_client import QdrantClient from qdrant_client.models import Filter, FieldCondition, MatchValue from werkzeug.utils import secure_filename from .utils import get_config, content_hash, clean_filename_to_title, derive_source_and_category, generate_download_url, setup_logging from .status import StatusDB -from .deployment_config import get_deployment_config -from .place_detail import get_place_detail, get_place_by_wikidata -from .landclass import lookup_landclass, format_summary logger = setup_logging('recon.api') @@ -60,19 +57,9 @@ class _LargeZimRequest(_FlaskRequest): return super()._get_file_stream(total_content_length, content_type, filename, content_length) app.request_class = _LargeZimRequest -# ── Address Book Blueprint ── -from .address_book_api import address_book_bp -app.register_blueprint(address_book_bp) - -# ── Contacts Blueprint ── -from .contacts_api import contacts_bp -app.register_blueprint(contacts_bp) - -# ── Netsyms + Geocode Blueprints ── -from .netsyms_api import netsyms_bp, geocode_bp +# ── Netsyms Blueprint ── +from .netsyms_api import netsyms_bp app.register_blueprint(netsyms_bp) -app.register_blueprint(geocode_bp) - # ── Navigation Constants ── @@ -102,12 +89,6 @@ SETTINGS_SUBNAV = [ {'href': '/settings/health', 'label': 'Service Health'}, ] -NAVI_SUBNAV = [ - {'href': '/nav-i', 'label': 'Overview'}, - {'href': '/deleted-contacts', 'label': 'Deleted Contacts'}, - {'href': '/nav-i/api-keys', 'label': 'API Keys'}, -] - def _format_source_citation(payload): """Format a human-readable citation from a search result payload.""" @@ -334,36 +315,6 @@ def failures_page(): failures=failures) -@app.route("/deleted-contacts") -def deleted_contacts_page(): - from .auth import get_user_id - from .contacts import ContactsDB - user_id = get_user_id() or "anonymous" - db = ContactsDB() - contacts = db.list_deleted(user_id) - return render_template("navi/deleted_contacts.html", - domain="navi", subnav=NAVI_SUBNAV, active_page="/deleted-contacts", - contacts=contacts) - - -@app.route("/nav-i") -def navi_landing_page(): - from .auth import get_user_id - from .contacts import ContactsDB - user_id = get_user_id() or "anonymous" - db = ContactsDB() - deleted_count = len(db.list_deleted(user_id)) - return render_template("navi/landing.html", - domain="navi", subnav=NAVI_SUBNAV, active_page="/nav-i", - deleted_count=deleted_count) - - -@app.route("/nav-i/api-keys") -def navi_api_keys_page(): - return render_template("navi/api_keys.html", - domain="navi", subnav=NAVI_SUBNAV, active_page="/nav-i/api-keys") - - @app.route('/peertube') def peertube_dashboard(): return render_template('peertube/dashboard.html', @@ -1208,82 +1159,6 @@ def api_knowledge_stats(): return jsonify(_cache['knowledge_stats']) - -@app.route('/api/traffic/flow///.png') -def api_traffic_flow(z, x, y): - """Proxy TomTom traffic flow tiles to hide API key from frontend.""" - key = os.environ.get('TOMTOM_API_KEY') - if not key: - return 'Traffic service not configured', 503 - url = f'https://api.tomtom.com/traffic/map/4/tile/flow/relative/{z}/{x}/{y}.png?key={key}' - try: - resp = http_requests.get(url, timeout=10) - if resp.status_code != 200: - return 'Upstream error', 502 - r = make_response(resp.content) - r.headers['Content-Type'] = 'image/png' - r.headers['Cache-Control'] = 'public, max-age=120' - return r - except Exception: - return 'Upstream timeout', 504 - - -@app.route('/api/place//') -def api_place_detail(osm_type, osm_id): - """Proxy place details from local Nominatim or Overpass API.""" - result, status = get_place_detail(osm_type, osm_id) - return jsonify(result), status - - -@app.route("/api/place/wikidata/") -def api_place_wikidata(wikidata_id): - """Fetch place details from Wikidata entity.""" - result, status = get_place_by_wikidata(wikidata_id) - return jsonify(result), status - - - -@app.route('/api/landclass') -def api_landclass(): - """PAD-US land classification lookup for a point.""" - config = get_deployment_config() - if not config.get('features', {}).get('has_landclass'): - return jsonify({'error': 'Land classification not available'}), 404 - - try: - lat = float(request.args.get('lat', '')) - lon = float(request.args.get('lon', '')) - except (ValueError, TypeError): - return jsonify({'error': 'lat and lon required as numbers'}), 400 - - if not (-90 <= lat <= 90) or not (-180 <= lon <= 180): - return jsonify({'error': 'lat must be -90..90, lon must be -180..180'}), 400 - - classifications = lookup_landclass(lat, lon) - is_public = len(classifications) > 0 - is_private = len(classifications) == 0 - summary = format_summary(classifications) - - return jsonify({ - 'lat': lat, - 'lon': lon, - 'classifications': classifications, - 'count': len(classifications), - 'is_public': is_public, - 'is_private': is_private, - 'summary': summary, - }) - - -@app.route('/api/config') -def api_config(): - """Return deployment profile config for frontend consumption.""" - config = get_deployment_config() - resp = jsonify(config) - resp.headers['Cache-Control'] = 'public, max-age=300' - return resp - - @app.route('/api/health') def api_health(): """Health check endpoint for monitoring.""" @@ -1445,60 +1320,6 @@ def api_keys_reload(): -# ── Nav-I API Key Admin ── - -@app.route('/api/nav-i/api-keys/list', methods=['GET']) -def navi_api_keys_list(): - from .api_keys_admin import list_keys - return jsonify({'keys': list_keys()}) - - -@app.route('/api/nav-i/api-keys/update', methods=['POST']) -def navi_api_keys_update(): - from .auth import require_auth - from .api_keys_admin import update_key, update_gemini_key - data = request.get_json(force=True) - name = data.get('name', '') - new_value = data.get('new_value', '') - index = data.get('index') # optional, for Gemini key replacement - if not name or not new_value: - return jsonify({'error': 'name and new_value required'}), 400 - if name == 'GEMINI_KEY' and index is not None: - result = update_gemini_key(int(index), new_value) - else: - result = update_key(name, new_value) - if result.get('success'): - return jsonify(result) - return jsonify(result), 400 - - -@app.route('/api/nav-i/api-keys/test', methods=['POST']) -def navi_api_keys_test(): - from .api_keys_admin import test_key - data = request.get_json(force=True) - name = data.get('name', '') - index = data.get('index') # optional, for testing specific Gemini key - if not name: - return jsonify({'error': 'name required'}), 400 - result = test_key(name, index=int(index) if index is not None else None) - return jsonify(result) - - -@app.route('/api/nav-i/api-keys/restart-recon', methods=['POST']) -def navi_api_keys_restart(): - import subprocess - try: - result = subprocess.run( - ['sudo', 'systemctl', 'restart', 'recon'], - capture_output=True, text=True, timeout=30 - ) - if result.returncode == 0: - return jsonify({'success': True, 'note': 'RECON service restarted'}) - return jsonify({'success': False, 'error': result.stderr.strip()}), 500 - except subprocess.TimeoutExpired: - return jsonify({'success': False, 'error': 'Restart timed out'}), 500 - except Exception as e: - return jsonify({'success': False, 'error': str(e)}), 500 # ── YouTube Cookie Management ── @@ -2704,21 +2525,3 @@ def api_metrics_history(): return jsonify({'type': metric_type, 'hours': hours, 'points': points}) except Exception as e: return jsonify({'type': metric_type, 'hours': hours, 'points': [], 'error': str(e)}) - - -# ── Auth state endpoint ───────────────────────────────────────────────────── -# Returns current auth state for frontend consumption. -# This endpoint must be behind Caddy forward_auth to receive X-Authentik-* headers. -@app.route('/api/auth/whoami') -def api_auth_whoami(): - """Return auth state for frontend. Behind forward_auth, so headers are present when authenticated.""" - username = request.headers.get('X-Authentik-Username') - if username: - return jsonify({ - 'authenticated': True, - 'username': username, - }) - return jsonify({ - 'authenticated': False, - 'username': None, - }) diff --git a/lib/api_keys_admin.py b/lib/api_keys_admin.py deleted file mode 100644 index 3c63565..0000000 --- a/lib/api_keys_admin.py +++ /dev/null @@ -1,358 +0,0 @@ -""" -Nav-I API Keys Admin — unified view/update/test for third-party API keys. - -Manages three provider categories: - - Gemini (multiple keys via KeyManager singleton) - - TomTom (single key in .env) - - Google Places (single key in .env) - -All key values are masked in responses. Full values never leave the server -except as user-supplied input on update. -""" -import os -import re -import shutil -import tempfile -import time - -import requests as http_requests - -from .utils import setup_logging - -logger = setup_logging('recon.api_keys_admin') - -ENV_PATH = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '.env') - -# Key definitions: env_name → display metadata -_KEY_DEFS = { - 'TOMTOM_API_KEY': { - 'display_name': 'TomTom', - 'provider': 'tomtom', - }, - 'GOOGLE_PLACES_API_KEY': { - 'display_name': 'Google Places', - 'provider': 'google_places', - }, -} - - -# ── .env read/write helpers ───────────────────────────────────────────── - -def _read_env(): - """Read .env file into a dict of key=value pairs, preserving order.""" - entries = [] # list of (key, value, raw_line) — preserves order and comments - if not os.path.exists(ENV_PATH): - return entries - with open(ENV_PATH, 'r') as f: - for line in f: - raw = line.rstrip('\n') - stripped = raw.strip() - if not stripped or stripped.startswith('#'): - entries.append((None, None, raw)) - continue - m = re.match(r'^([A-Za-z_][A-Za-z0-9_]*)=(.*)$', stripped) - if m: - entries.append((m.group(1), m.group(2).strip().strip('"').strip("'"), raw)) - else: - entries.append((None, None, raw)) - return entries - - -def _write_env(entries): - """Atomically write .env from entries list. Backs up to .env.bak first.""" - # Backup current .env - if os.path.exists(ENV_PATH): - bak_path = ENV_PATH + '.bak' - shutil.copy2(ENV_PATH, bak_path) - - # Write to temp file, then rename (atomic on same filesystem) - fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(ENV_PATH), prefix='.env.', suffix='.tmp') - try: - with os.fdopen(fd, 'w') as f: - for key, value, raw in entries: - if key is not None: - f.write(f'{key}={value}\n') - else: - f.write(raw + '\n') - os.rename(tmp_path, ENV_PATH) - except Exception: - # Clean up temp file on failure - try: - os.unlink(tmp_path) - except OSError: - pass - raise - - logger.info(f"Wrote .env atomically ({len([e for e in entries if e[0]])} keys)") - - -def _get_env_value(name): - """Get a single value from .env by key name.""" - for key, value, _ in _read_env(): - if key == name: - return value - return None - - -def _set_env_value(name, new_value): - """Set a single value in .env. Adds if not present.""" - entries = _read_env() - found = False - for i, (key, value, raw) in enumerate(entries): - if key == name: - entries[i] = (name, new_value, f'{name}={new_value}') - found = True - break - if not found: - entries.append((name, new_value, f'{name}={new_value}')) - _write_env(entries) - - -# ── Masking ───────────────────────────────────────────────────────────── - -def _mask_key(value): - """Mask a key: first 4 chars + '...' + last 4 chars. Never return full value.""" - if not value: - return None - if len(value) <= 8: - return '****' - return value[:4] + '...' + value[-4:] - - -# ── List ──────────────────────────────────────────────────────────────── - -def list_keys(): - """ - Return masked status of all managed API keys. - - Returns list of dicts with: name, display_name, provider, masked_value, - is_set, count (for multi-key providers like Gemini). - """ - result = [] - env_mtime = None - if os.path.exists(ENV_PATH): - env_mtime = time.strftime('%Y-%m-%dT%H:%M:%SZ', - time.gmtime(os.path.getmtime(ENV_PATH))) - - # Gemini keys (via KeyManager) - from .key_manager import get_key_manager - km = get_key_manager() - gemini_keys = km.get_masked_keys() - gemini_count = len(gemini_keys) - # Show a single summary entry for Gemini with count - first_masked = gemini_keys[0]['masked'] if gemini_keys else None - result.append({ - 'name': 'GEMINI_KEY', - 'display_name': 'Gemini', - 'provider': 'gemini', - 'masked_value': first_masked, - 'is_set': gemini_count > 0, - 'count': gemini_count, - 'last_modified': env_mtime, - 'keys': gemini_keys, # full list with per-key stats - }) - - # Single-value keys - for env_name, meta in _KEY_DEFS.items(): - value = _get_env_value(env_name) - result.append({ - 'name': env_name, - 'display_name': meta['display_name'], - 'provider': meta['provider'], - 'masked_value': _mask_key(value), - 'is_set': bool(value), - 'count': 1 if value else 0, - 'last_modified': env_mtime, - }) - - return result - - -# ── Update ────────────────────────────────────────────────────────────── - -def update_key(name, new_value): - """ - Update a key value. For Gemini, name should be 'GEMINI_KEY' with an - optional 'index' for replacing a specific key, or use the KeyManager API. - For TomTom/Google Places, writes directly to .env. - - Returns dict with success status and masked value. - """ - new_value = new_value.strip() - if not new_value: - return {'success': False, 'error': 'Key value cannot be empty'} - - if name == 'GEMINI_KEY': - # Use KeyManager for Gemini - from .key_manager import get_key_manager - km = get_key_manager() - try: - idx = km.add_gemini_key(new_value) - return { - 'success': True, - 'name': name, - 'masked_value': _mask_key(new_value), - 'action': 'added', - 'index': idx, - } - except ValueError as e: - return {'success': False, 'error': str(e)} - - if name in _KEY_DEFS: - _set_env_value(name, new_value) - return { - 'success': True, - 'name': name, - 'masked_value': _mask_key(new_value), - 'action': 'updated', - } - - return {'success': False, 'error': f'Unknown key: {name}'} - - -def update_gemini_key(index, new_value): - """Replace a specific Gemini key by index.""" - new_value = new_value.strip() - if not new_value: - return {'success': False, 'error': 'Key value cannot be empty'} - - from .key_manager import get_key_manager - km = get_key_manager() - try: - km.replace_gemini_key(index, new_value) - return { - 'success': True, - 'name': 'GEMINI_KEY', - 'index': index, - 'masked_value': _mask_key(new_value), - 'action': 'replaced', - } - except (ValueError, IndexError) as e: - return {'success': False, 'error': str(e)} - - -# ── Test ──────────────────────────────────────────────────────────────── - -def test_key(name, index=None): - """ - Test a key against its provider API using the current .env value. - - Returns dict with: success, latency_ms, error, note. - """ - if name == 'GEMINI_KEY': - return _test_gemini(index) - elif name == 'TOMTOM_API_KEY': - return _test_tomtom() - elif name == 'GOOGLE_PLACES_API_KEY': - return _test_google_places() - else: - return {'success': False, 'error': f'Unknown key: {name}', 'latency_ms': 0} - - -def _test_gemini(index=None): - """Test Gemini key by listing models.""" - from .key_manager import get_key_manager - km = get_key_manager() - - if index is not None: - key = km.get_gemini_key(index) - if not key: - return {'success': False, 'error': f'Gemini key index {index} not found', 'latency_ms': 0} - else: - key = km.get_gemini_key(0) - if not key: - return {'success': False, 'error': 'No Gemini keys configured', 'latency_ms': 0} - - t0 = time.time() - try: - resp = http_requests.get( - f"https://generativelanguage.googleapis.com/v1beta/models?key={key}", - timeout=10 - ) - latency = int((time.time() - t0) * 1000) - - if resp.status_code == 200 and 'models' in resp.text: - return {'success': True, 'latency_ms': latency, 'error': None, - 'note': 'Models list returned successfully'} - elif resp.status_code == 403: - return {'success': False, 'latency_ms': latency, - 'error': 'Key disabled or quota exhausted'} - elif resp.status_code == 429: - return {'success': True, 'latency_ms': latency, 'error': None, - 'note': 'Valid key — currently rate-limited'} - else: - return {'success': False, 'latency_ms': latency, - 'error': f'HTTP {resp.status_code}'} - except Exception as e: - latency = int((time.time() - t0) * 1000) - return {'success': False, 'latency_ms': latency, 'error': str(e)} - - -def _test_tomtom(): - """Test TomTom key with a minimal geocode request.""" - key = _get_env_value('TOMTOM_API_KEY') - if not key: - return {'success': False, 'error': 'TOMTOM_API_KEY not set', 'latency_ms': 0} - - t0 = time.time() - try: - resp = http_requests.get( - f"https://api.tomtom.com/search/2/geocode/Boise.json", - params={'key': key, 'limit': 1}, - timeout=10 - ) - latency = int((time.time() - t0) * 1000) - - if resp.status_code == 200: - data = resp.json() - count = data.get('summary', {}).get('totalResults', 0) - return {'success': True, 'latency_ms': latency, 'error': None, - 'note': f'Geocode returned {count} result(s)'} - elif resp.status_code == 403: - return {'success': False, 'latency_ms': latency, - 'error': 'Invalid or expired key'} - else: - return {'success': False, 'latency_ms': latency, - 'error': f'HTTP {resp.status_code}'} - except Exception as e: - latency = int((time.time() - t0) * 1000) - return {'success': False, 'latency_ms': latency, 'error': str(e)} - - -def _test_google_places(): - """Test Google Places (New) API key with a minimal searchText request.""" - key = _get_env_value('GOOGLE_PLACES_API_KEY') - if not key: - return {'success': False, 'error': 'GOOGLE_PLACES_API_KEY not set', 'latency_ms': 0} - - t0 = time.time() - try: - resp = http_requests.post( - "https://places.googleapis.com/v1/places:searchText", - json={'textQuery': 'Boise Idaho', 'maxResultCount': 1}, - headers={ - 'X-Goog-Api-Key': key, - 'X-Goog-FieldMask': 'places.displayName', - }, - timeout=10 - ) - latency = int((time.time() - t0) * 1000) - - if resp.status_code == 200: - data = resp.json() - count = len(data.get('places', [])) - return {'success': True, 'latency_ms': latency, 'error': None, - 'note': f'searchText returned {count} place(s)'} - elif resp.status_code == 403: - return {'success': False, 'latency_ms': latency, - 'error': 'Key not authorized for Places API (New)'} - elif resp.status_code == 429: - return {'success': True, 'latency_ms': latency, 'error': None, - 'note': 'Valid key — quota exceeded'} - else: - body = resp.text[:200] - return {'success': False, 'latency_ms': latency, - 'error': f'HTTP {resp.status_code}: {body}'} - except Exception as e: - latency = int((time.time() - t0) * 1000) - return {'success': False, 'latency_ms': latency, 'error': str(e)} diff --git a/lib/contacts.py b/lib/contacts.py deleted file mode 100644 index f2782db..0000000 --- a/lib/contacts.py +++ /dev/null @@ -1,230 +0,0 @@ -""" -RECON Contacts Database — per-user phone book with soft delete and proximity queries. - -Separate DB at data/contacts.db. Thread-local connections with WAL mode (StatusDB pattern). -""" -import math -import os -import sqlite3 -import threading -from datetime import datetime, timezone - -_local = threading.local() - -_SCHEMA = """ -CREATE TABLE IF NOT EXISTS contacts ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - user_id TEXT NOT NULL, - label TEXT NOT NULL, - name TEXT, - call_sign TEXT, - phone TEXT, - email TEXT, - category TEXT, - notes TEXT, - lat REAL, - lon REAL, - osm_type TEXT, - osm_id INTEGER, - address TEXT, - show_proximity INTEGER DEFAULT 0, - created_at TEXT DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now')), - updated_at TEXT DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now')), - deleted_at TEXT, - deleted_by TEXT -); - -CREATE INDEX IF NOT EXISTS idx_contacts_user ON contacts(user_id); -CREATE INDEX IF NOT EXISTS idx_contacts_user_category ON contacts(user_id, category); -CREATE INDEX IF NOT EXISTS idx_contacts_user_deleted ON contacts(user_id, deleted_at); -CREATE INDEX IF NOT EXISTS idx_contacts_geo ON contacts(lat, lon); -CREATE UNIQUE INDEX IF NOT EXISTS idx_contacts_home_work - ON contacts(user_id, label) - WHERE label IN ('Home', 'Work') AND deleted_at IS NULL; -""" - - -def _haversine_m(lat1, lon1, lat2, lon2): - """Haversine distance in meters.""" - R = 6_371_000 - rlat1, rlat2 = math.radians(lat1), math.radians(lat2) - dlat = math.radians(lat2 - lat1) - dlon = math.radians(lon2 - lon1) - a = math.sin(dlat / 2) ** 2 + math.cos(rlat1) * math.cos(rlat2) * math.sin(dlon / 2) ** 2 - return R * 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a)) - - -def _row_to_dict(row): - """Convert sqlite3.Row to dict, casting show_proximity to bool.""" - d = dict(row) - d['show_proximity'] = bool(d.get('show_proximity', 0)) - return d - - -class ContactsDB: - def __init__(self, db_path=None): - if db_path is None: - db_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'contacts.db') - self.db_path = db_path - os.makedirs(os.path.dirname(db_path), exist_ok=True) - self._init_db() - - def _get_conn(self): - if not hasattr(_local, 'contacts_conn') or _local.contacts_conn is None: - _local.contacts_conn = sqlite3.connect(self.db_path, timeout=30) - _local.contacts_conn.row_factory = sqlite3.Row - _local.contacts_conn.execute("PRAGMA journal_mode=WAL") - _local.contacts_conn.execute("PRAGMA busy_timeout=5000") - return _local.contacts_conn - - def _init_db(self): - conn = self._get_conn() - conn.executescript(_SCHEMA) - conn.commit() - - def list_all(self, user_id, category=None, search=None): - conn = self._get_conn() - sql = "SELECT * FROM contacts WHERE user_id = ? AND deleted_at IS NULL" - params = [user_id] - if category: - sql += " AND category = ?" - params.append(category) - if search: - sql += " AND (label LIKE ? OR name LIKE ? OR call_sign LIKE ? OR phone LIKE ?)" - like = f"%{search}%" - params.extend([like, like, like, like]) - sql += " ORDER BY label" - return [_row_to_dict(r) for r in conn.execute(sql, params).fetchall()] - - def list_deleted(self, user_id): - conn = self._get_conn() - rows = conn.execute( - "SELECT * FROM contacts WHERE user_id = ? AND deleted_at IS NOT NULL ORDER BY deleted_at DESC", - (user_id,) - ).fetchall() - return [_row_to_dict(r) for r in rows] - - def get(self, user_id, contact_id, include_deleted=False): - conn = self._get_conn() - sql = "SELECT * FROM contacts WHERE id = ? AND user_id = ?" - if not include_deleted: - sql += " AND deleted_at IS NULL" - row = conn.execute(sql, (contact_id, user_id)).fetchone() - return _row_to_dict(row) if row else None - - def create(self, user_id, **fields): - conn = self._get_conn() - fields.pop('id', None) - fields.pop('user_id', None) - fields.pop('created_at', None) - fields.pop('updated_at', None) - fields.pop('deleted_at', None) - fields.pop('deleted_by', None) - if 'show_proximity' in fields: - fields['show_proximity'] = 1 if fields['show_proximity'] else 0 - columns = ['user_id'] + list(fields.keys()) - placeholders = ', '.join(['?'] * len(columns)) - col_str = ', '.join(columns) - values = [user_id] + list(fields.values()) - try: - cur = conn.execute(f"INSERT INTO contacts ({col_str}) VALUES ({placeholders})", values) - conn.commit() - return self.get(user_id, cur.lastrowid), None - except sqlite3.IntegrityError: - return None, 'conflict' - - def update(self, user_id, contact_id, **fields): - conn = self._get_conn() - fields.pop('id', None) - fields.pop('user_id', None) - fields.pop('created_at', None) - fields.pop('deleted_at', None) - fields.pop('deleted_by', None) - if 'show_proximity' in fields: - fields['show_proximity'] = 1 if fields['show_proximity'] else 0 - fields['updated_at'] = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%fZ') - sets = ', '.join(f"{k} = ?" for k in fields) - values = list(fields.values()) + [contact_id, user_id] - conn.execute(f"UPDATE contacts SET {sets} WHERE id = ? AND user_id = ? AND deleted_at IS NULL", values) - conn.commit() - return self.get(user_id, contact_id) - - def soft_delete(self, user_id, contact_id): - conn = self._get_conn() - now = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%fZ') - conn.execute( - "UPDATE contacts SET deleted_at = ?, deleted_by = ? WHERE id = ? AND user_id = ? AND deleted_at IS NULL", - (now, user_id, contact_id, user_id) - ) - conn.commit() - return self.get(user_id, contact_id, include_deleted=True) - - def restore(self, user_id, contact_id): - conn = self._get_conn() - row = self.get(user_id, contact_id, include_deleted=True) - if not row or not row.get('deleted_at'): - return None, 'not_found' - if row.get('label') in ('Home', 'Work'): - existing = conn.execute( - "SELECT id FROM contacts WHERE user_id = ? AND label = ? AND deleted_at IS NULL AND id != ?", - (user_id, row['label'], contact_id) - ).fetchone() - if existing: - return None, 'conflict' - conn.execute( - "UPDATE contacts SET deleted_at = NULL, deleted_by = NULL WHERE id = ? AND user_id = ?", - (contact_id, user_id) - ) - conn.commit() - return self.get(user_id, contact_id), None - - def restore_as(self, user_id, contact_id, new_label): - """Restore a soft-deleted contact with a new label (for Home/Work conflict resolution).""" - conn = self._get_conn() - row = self.get(user_id, contact_id, include_deleted=True) - if not row or not row.get('deleted_at'): - return None, 'not_found' - if not new_label or not new_label.strip(): - return None, 'invalid_label' - now = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%fZ') - try: - conn.execute( - "UPDATE contacts SET deleted_at = NULL, deleted_by = NULL, label = ?, updated_at = ? WHERE id = ? AND user_id = ?", - (new_label.strip(), now, contact_id, user_id) - ) - conn.commit() - except sqlite3.IntegrityError: - return None, 'conflict' - return self.get(user_id, contact_id), None - - def purge(self, user_id, contact_id): - conn = self._get_conn() - row = self.get(user_id, contact_id, include_deleted=True) - if not row: - return False, 'not_found' - if not row.get('deleted_at'): - return False, 'not_deleted' - conn.execute("DELETE FROM contacts WHERE id = ? AND user_id = ?", (contact_id, user_id)) - conn.commit() - return True, None - - def find_nearby(self, user_id, lat, lon, radius_m=75): - conn = self._get_conn() - # Bounding box pre-filter (~111km per degree lat) - dlat = radius_m / 111_000 - dlon = radius_m / (111_000 * math.cos(math.radians(lat))) - rows = conn.execute( - """SELECT * FROM contacts - WHERE user_id = ? AND deleted_at IS NULL AND show_proximity = 1 - AND lat BETWEEN ? AND ? AND lon BETWEEN ? AND ?""", - (user_id, lat - dlat, lat + dlat, lon - dlon, lon + dlon) - ).fetchall() - results = [] - for r in rows: - dist = _haversine_m(lat, lon, r['lat'], r['lon']) - if dist <= radius_m: - d = _row_to_dict(r) - d['distance_m'] = round(dist, 1) - results.append(d) - results.sort(key=lambda x: x['distance_m']) - return results diff --git a/lib/contacts_api.py b/lib/contacts_api.py deleted file mode 100644 index 0e4506b..0000000 --- a/lib/contacts_api.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -RECON Contacts API — Flask Blueprint. - -Per-user phone book with soft delete, restore, purge, and proximity queries. -All endpoints require Authentik forward-auth (X-Authentik-Username header). -""" -from flask import Blueprint, request, jsonify - -from .auth import require_auth -from .contacts import ContactsDB - -contacts_bp = Blueprint('contacts', __name__) - -_db = None - -def _get_db(): - global _db - if _db is None: - _db = ContactsDB() - return _db - - -@contacts_bp.route('/api/contacts', methods=['GET']) -@require_auth -def list_contacts(): - db = _get_db() - category = request.args.get('category') - search = request.args.get('search') - return jsonify(db.list_all(request.user_id, category=category, search=search)) - - -@contacts_bp.route('/api/contacts', methods=['POST']) -@require_auth -def create_contact(): - db = _get_db() - data = request.get_json(force=True) - contact, err = db.create(request.user_id, **data) - if err == 'conflict': - return jsonify({'error': 'You already have a Home/Work contact'}), 409 - return jsonify(contact), 201 - - -@contacts_bp.route('/api/contacts/nearby', methods=['GET']) -@require_auth -def nearby_contacts(): - db = _get_db() - lat = request.args.get('lat', type=float) - lon = request.args.get('lon', type=float) - radius_m = request.args.get('radius_m', 75, type=float) - if lat is None or lon is None: - return jsonify({'error': 'lat and lon required'}), 400 - return jsonify(db.find_nearby(request.user_id, lat, lon, radius_m)) - - -@contacts_bp.route('/api/contacts/deleted', methods=['GET']) -@require_auth -def list_deleted(): - db = _get_db() - return jsonify(db.list_deleted(request.user_id)) - - -@contacts_bp.route('/api/contacts/', methods=['GET']) -@require_auth -def get_contact(contact_id): - db = _get_db() - contact = db.get(request.user_id, contact_id) - if not contact: - return jsonify({'error': 'Not found'}), 404 - return jsonify(contact) - - -@contacts_bp.route('/api/contacts/', methods=['PATCH']) -@require_auth -def update_contact(contact_id): - db = _get_db() - data = request.get_json(force=True) - contact = db.update(request.user_id, contact_id, **data) - if not contact: - return jsonify({'error': 'Not found'}), 404 - return jsonify(contact) - - -@contacts_bp.route('/api/contacts/', methods=['DELETE']) -@require_auth -def delete_contact(contact_id): - db = _get_db() - contact = db.soft_delete(request.user_id, contact_id) - if not contact: - return jsonify({'error': 'Not found'}), 404 - return jsonify(contact) - - -@contacts_bp.route('/api/contacts//restore', methods=['POST']) -@require_auth -def restore_contact(contact_id): - db = _get_db() - contact, err = db.restore(request.user_id, contact_id) - if err == 'not_found': - return jsonify({'error': 'Not found'}), 404 - if err == 'conflict': - return jsonify({'error': 'You already have a Home/Work contact'}), 409 - return jsonify(contact) - - -@contacts_bp.route('/api/contacts//restore-as', methods=['POST']) -@require_auth -def restore_as_contact(contact_id): - db = _get_db() - data = request.get_json(force=True) - new_label = data.get('label', '').strip() - if not new_label: - return jsonify({'error': 'label is required'}), 400 - contact, err = db.restore_as(request.user_id, contact_id, new_label) - if err == 'not_found': - return jsonify({'error': 'Not found'}), 404 - if err == 'invalid_label': - return jsonify({'error': 'Invalid label'}), 400 - if err == 'conflict': - return jsonify({'error': 'Label conflict'}), 409 - return jsonify(contact) - - -@contacts_bp.route('/api/contacts//purge', methods=['DELETE']) -@require_auth -def purge_contact(contact_id): - db = _get_db() - ok, err = db.purge(request.user_id, contact_id) - if err == 'not_found': - return jsonify({'error': 'Not found'}), 404 - if err == 'not_deleted': - return jsonify({'error': 'Contact must be deleted before purging'}), 400 - return jsonify({'ok': True}) diff --git a/lib/deployment_config.py b/lib/deployment_config.py index 978b8a0..ab6aa17 100644 --- a/lib/deployment_config.py +++ b/lib/deployment_config.py @@ -3,7 +3,15 @@ Deployment profile loader. Reads RECON_PROFILE env var (default: "home"), loads the matching YAML from config/profiles/.yaml, and caches the parsed dict in memory. -Provides get_deployment_config() for use by the /api/config endpoint. + +Exposes get_deployment_config() as the in-process accessor for the profile. + +Note: its former consumers (the /api/landclass gate, google_places, +place_detail, offroute/router) were all extracted to navi-* services or removed +across cleanups #4–#6/#27 — recon has no remaining caller of +get_deployment_config() today; the module is retained per cleanup #1. +(The former /api/config HTTP endpoint that served this dict to the frontend was +removed once navi-config (:8422) took over that route.) """ import os import yaml diff --git a/lib/extractor.py b/lib/extractor.py index 13159c9..bc236ab 100644 --- a/lib/extractor.py +++ b/lib/extractor.py @@ -21,6 +21,7 @@ Config: processing.extract_workers, processing.max_pdf_size_mb, processing.extract_timeout, processing.page_timeout """ import base64 +import re import json import os import random @@ -99,6 +100,40 @@ def _is_transient(error_str): return any(sig in s for sig in transient_signals) +def _text_quality_ok(text, min_length=50): + """Check if extracted text meets quality thresholds. + + Beyond the basic length check, validates: + - Word-boundary ratio: at least 60% of tokens should be real words (2+ alpha chars) + - Concatenation ratio: lowercase-immediately-followed-by-uppercase shouldn't exceed 10% of word count + + Returns True if text passes all checks. + """ + text = text.strip() + if len(text) < min_length: + return False + + words = text.split() + if not words: + return False + + # Word-like ratio: tokens with 2+ alphabetic characters + word_like = sum(1 for w in words if len(re.findall(r'[a-zA-Z]', w)) >= 2) + word_ratio = word_like / len(words) + if word_ratio < 0.60: + return False + + # Concatenation detector: lowercase immediately followed by uppercase + # Filter out common camelCase patterns in code (short tokens) + concat_hits = len(re.findall(r'[a-z][A-Z]', text)) + concat_ratio = concat_hits / len(words) if words else 0 + if concat_ratio > 0.10: + return False + + return True + + + def _render_page_to_png(pdf_path, page_num_1indexed, dpi=200, timeout=30): """Render a single PDF page to PNG bytes using pdftoppm. @@ -224,7 +259,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30): # Method 1: pdftotext (poppler) try: result = subprocess.run( - ['pdftotext', '-f', str(page_num_0indexed + 1), + ['pdftotext', '-layout', '-f', str(page_num_0indexed + 1), '-l', str(page_num_0indexed + 1), pdf_path, '-'], capture_output=True, text=True, timeout=page_timeout ) @@ -233,7 +268,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30): except Exception: pass - if len(text.strip()) >= 50: + if _text_quality_ok(text): return text, 'pdftotext' # Method 2: pdftoppm + Tesseract OCR @@ -258,7 +293,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30): except Exception: pass - if len(text.strip()) >= 50: + if _text_quality_ok(text): return text, 'tesseract' # Method 3: Gemini Vision (last resort) @@ -276,8 +311,26 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30): # ── Core extraction functions ── def _pypdf2_extract(reader, page_num): - """Extract text from a PyPDF2 page object. Runs inside a thread for timeout.""" - return reader.pages[page_num].extract_text() or '' + """Extract text from a PyPDF2 page object. Runs inside a thread for timeout. + + Tries default extraction first (space_width=200). If quality check fails, + retries with space_width=100 which better detects word boundaries in + tightly-kerned PDFs (common in Haynes/workshop manuals). + + Note: PyPDF2 3.0.1 does not support layout=True. The space_width parameter + controls word-boundary detection tolerance. Lower values = more aggressive + space insertion between characters. + """ + text = reader.pages[page_num].extract_text() or '' + if _text_quality_ok(text): + return text + + # Retry with tighter word-boundary detection + text_tight = reader.pages[page_num].extract_text(space_width=100.0) or '' + if len(text_tight.strip()) >= len(text.strip()): + return text_tight + + return text def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30): @@ -302,13 +355,13 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30): except Exception: text = '' - if len(text.strip()) >= 50: + if _text_quality_ok(text): return text, 'pypdf2' # Method 2: pdftotext via subprocess (inherently timeout-safe) try: result = subprocess.run( - ['pdftotext', '-f', str(page_num + 1), '-l', str(page_num + 1), pdf_path, '-'], + ['pdftotext', '-layout', '-f', str(page_num + 1), '-l', str(page_num + 1), pdf_path, '-'], capture_output=True, text=True, timeout=page_timeout ) if result.returncode == 0 and len(result.stdout.strip()) > len(text.strip()): @@ -316,7 +369,7 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30): except Exception: pass - if len(text.strip()) >= 50: + if _text_quality_ok(text): return text, 'pdftotext' # Method 3: pdftoppm + Tesseract OCR @@ -340,7 +393,7 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30): except Exception: pass - if len(text.strip()) >= 50: + if _text_quality_ok(text): return text, 'tesseract' # Method 4: Gemini Vision (last resort — costs API calls but handles scanned docs) diff --git a/lib/geocode.py b/lib/geocode.py deleted file mode 100644 index aabd37e..0000000 --- a/lib/geocode.py +++ /dev/null @@ -1,774 +0,0 @@ -""" -RECON geocode — structured preprocessing, multi-source retrieval, reranking. - -Replaces the naive Photon-only search with: - 1. usaddress parsing + intent classification (ADDRESS / POI / LOCALITY / COORD / POSTCODE) - 2. Multi-source retrieval: ADDRESS → Netsyms + Photon; POI/LOCALITY → Photon /api - 3. Python reranker with weighted signals - -Public entry point: geocode(query, limit) → {query, results, count} -""" - -import math -import re -import logging - -import requests -import usaddress -from rapidfuzz import fuzz - -from .utils import setup_logging - -logger = setup_logging('recon.geocode') - -# ── Trace logger for reranking audit ── -_trace_logger = logging.getLogger('recon.geocode.trace') -_trace_handler = logging.FileHandler('/tmp/geocode_rerank_trace.log') -_trace_handler.setFormatter(logging.Formatter('%(asctime)s %(message)s')) -_trace_logger.addHandler(_trace_handler) -_trace_logger.setLevel(logging.DEBUG) - -# ── Config constants ── -PHOTON_URL = "http://localhost:2322" -GEOCODE_BIAS_LAT = 42.5736 -GEOCODE_BIAS_LON = -114.6066 -GEOCODE_BIAS_ZOOM = 10 -ADDRESS_BOOK_ANNOTATION_RADIUS_M = 75 - -# ── Reranker weights ── -# Derived from research analysis of failure modes: -# housenumber_exact is the strongest signal because Photon's soft-boost -# lets wrong-number results bubble up. street_name_fuzz and locality_fuzz -# handle abbreviation/case variation. source_authority gives Netsyms a -# boost for US addresses since it has USPS-verified data. -W_HOUSENUMBER_EXACT = 6.0 # exact housenumber match -W_HOUSENUMBER_MISMATCH = -5.0 # housenumber present but wrong -W_STREET_NAME_FUZZ = 3.0 # fuzzy street name similarity [0..1] * weight -W_TOKEN_COVERAGE = 2.0 # fraction of query tokens found in result -W_STREET_TYPE_MATCH = 1.5 # "st" matches "street", etc. -W_LOCALITY_FUZZ = 2.0 # city/state fuzzy match -W_SOURCE_AUTHORITY = 2.0 # Netsyms for US addresses -W_LAYER_RANK = 1.0 # type-appropriate results ranked higher -W_PHOTON_POSITION_NORM = 1.0 # Photon's native ranking (normalized by position) -W_STATE_EXACT = 1.0 # exact state code match -W_POI_CLASS_BOOST = 3.0 # amenity/shop/etc boost for business-name queries -W_HIGHWAY_CLASS_PENALTY = -4.0 # highway/route penalty for business-name queries - -# ── US abbreviation expansions ── -# Applied ONLY to parsed StreetName/StreetNamePostType tokens, NOT to ordinals. -_STREET_TYPE_ABBREVS = { - 'st': 'street', 'ave': 'avenue', 'blvd': 'boulevard', 'dr': 'drive', - 'rd': 'road', 'ln': 'lane', 'ct': 'court', 'cir': 'circle', - 'pl': 'place', 'way': 'way', 'pkwy': 'parkway', 'hwy': 'highway', - 'trl': 'trail', 'ter': 'terrace', 'sq': 'square', -} -_DIRECTIONAL_ABBREVS = { - 'n': 'north', 's': 'south', 'e': 'east', 'w': 'west', - 'ne': 'northeast', 'nw': 'northwest', 'se': 'southeast', 'sw': 'southwest', -} -_ORDINAL_RE = re.compile(r'^\d+(st|nd|rd|th)$', re.IGNORECASE) - -# ── Road keywords (for detecting when query is about a road vs a business) ── -_ROAD_KEYWORDS = ( - set(_STREET_TYPE_ABBREVS.keys()) - | set(_STREET_TYPE_ABBREVS.values()) - | {'route', 'rte', 'pass'} -) - -# ── US state codes ── -_STATE_CODES = { - 'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', - 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', - 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', - 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', - 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'DC', -} - -# ── Full state name → code (for intent classifier) ── -_STATE_NAME_TO_CODE = { - 'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR', - 'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE', - 'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID', - 'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS', - 'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD', - 'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', - 'mississippi': 'MS', 'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE', - 'nevada': 'NV', 'new hampshire': 'NH', 'new jersey': 'NJ', - 'new mexico': 'NM', 'new york': 'NY', 'north carolina': 'NC', - 'north dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK', 'oregon': 'OR', - 'pennsylvania': 'PA', 'rhode island': 'RI', 'south carolina': 'SC', - 'south dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT', - 'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA', - 'west virginia': 'WV', 'wisconsin': 'WI', 'wyoming': 'WY', -} - -# Coordinate regex -_COORD_RE = re.compile(r'^\s*(-?\d+\.?\d*)\s*[,\s]\s*(-?\d+\.?\d*)\s*$') - - -# ═══════════════════════════════════════════════════════════════════ -# STEP 1: PREPROCESSING -# ═══════════════════════════════════════════════════════════════════ - -def _parse_coords(text): - """Return (lat, lon) if text looks like coordinates with valid bounds, else None.""" - m = _COORD_RE.match(text.strip()) - if not m: - return None - lat, lon = float(m.group(1)), float(m.group(2)) - if -90 <= lat <= 90 and -180 <= lon <= 180: - return lat, lon - return None - - -def _classify_and_parse(query): - """ - Parse query with usaddress, classify intent, expand abbreviations. - - Returns (intent, parsed_dict) where: - intent: 'ADDRESS' | 'POI' | 'LOCALITY' | 'POSTCODE' | 'COORD' | 'UNKNOWN' - parsed_dict: {number, street, city, state, zipcode, raw_query, expanded_query} - """ - q = query.strip() - parsed = { - 'number': None, 'street': None, 'street_raw': None, - 'city': None, 'state': None, - 'zipcode': None, 'raw_query': q, 'expanded_query': q, - } - - # Coordinate check first - if _parse_coords(q): - return 'COORD', parsed - - # Try usaddress - try: - tagged, addr_type = usaddress.tag(q) - except usaddress.RepeatedLabelError: - # Ambiguous input — fall back to free-text Photon - return 'UNKNOWN', parsed - - # Extract components - number = tagged.get('AddressNumber', '').strip() - street_name = tagged.get('StreetName', '').strip() - street_pre_dir = tagged.get('StreetNamePreDirectional', '').strip() - street_post_type = tagged.get('StreetNamePostType', '').strip() - place = tagged.get('PlaceName', '').strip() - state = tagged.get('StateName', '').strip() - zipcode = tagged.get('ZipCode', '').strip() - - # ── Fix usaddress edge case: "214 N St Filer" ── - # usaddress reads single-letter directional + "St" as PreDirectional + empty, - # mashing "St Filer" into StreetName. Detect: PreDirectional is single letter, - # StreetName has 2+ tokens where the first is a street type. - if (street_pre_dir and len(street_pre_dir) <= 2 - and not street_name.strip().startswith(street_pre_dir) - and ' ' in street_name): - name_tokens = street_name.split() - first_lower = name_tokens[0].lower() - if first_lower in _STREET_TYPE_ABBREVS or first_lower in _STREET_TYPE_ABBREVS.values(): - # "N" is actually the street name, "St" is the post-type - street_name = street_pre_dir - street_post_type = name_tokens[0] - if len(name_tokens) > 1: - place = ' '.join(name_tokens[1:]) - street_pre_dir = '' - - # ── Expand abbreviations (guard ordinals) ── - expanded_parts = [] - - if number: - parsed['number'] = number - expanded_parts.append(number) - - if street_pre_dir: - exp = _DIRECTIONAL_ABBREVS.get(street_pre_dir.lower(), street_pre_dir) - expanded_parts.append(exp) - - if street_name: - # Don't expand ordinals: "21st" stays "21st" - if _ORDINAL_RE.match(street_name): - expanded_parts.append(street_name) - else: - # Expand directional abbreviation if it IS the street name - exp = _DIRECTIONAL_ABBREVS.get(street_name.lower(), street_name) - expanded_parts.append(exp) - parsed['street'] = street_name - - if street_post_type: - if _ORDINAL_RE.match(street_post_type): - expanded_parts.append(street_post_type) - else: - exp = _STREET_TYPE_ABBREVS.get(street_post_type.lower(), street_post_type) - expanded_parts.append(exp) - - # Build raw street (original abbreviations, for Netsyms) and expanded (for Photon) - raw_street_parts = [] - if street_pre_dir: - raw_street_parts.append(street_pre_dir) - if street_name: - raw_street_parts.append(street_name) - if street_post_type: - raw_street_parts.append(street_post_type) - parsed['street_raw'] = ' '.join(raw_street_parts) - - # Build the full expanded street - if expanded_parts: - # The street is everything after the number - street_full = ' '.join(expanded_parts[1:] if number else expanded_parts) - parsed['street'] = street_full - - if place: - parsed['city'] = place - expanded_parts.append(place) - if state: - parsed['state'] = state.upper() - expanded_parts.append(state) - if zipcode: - parsed['zipcode'] = zipcode - expanded_parts.append(zipcode) - - parsed['expanded_query'] = ' '.join(expanded_parts) - - # ── Intent classification ── - if addr_type == 'Street Address' and number: - return 'ADDRESS', parsed - elif zipcode and not number and not street_name: - return 'POSTCODE', parsed - elif addr_type == 'Ambiguous': - # Check if it looks like a locality: last token(s) are a state code or name - tokens = q.replace(',', ' ').split() - if len(tokens) >= 2: - last_upper = tokens[-1].upper() - if last_upper in _STATE_CODES: - parsed['city'] = ' '.join(tokens[:-1]) - parsed['state'] = last_upper - return 'LOCALITY', parsed - # Check full state names (single-word like "idaho" or two-word like "new york") - last_lower = tokens[-1].lower() - if last_lower in _STATE_NAME_TO_CODE: - parsed['city'] = ' '.join(tokens[:-1]) - parsed['state'] = _STATE_NAME_TO_CODE[last_lower] - return 'LOCALITY', parsed - if len(tokens) >= 3: - two_word = f"{tokens[-2].lower()} {last_lower}" - if two_word in _STATE_NAME_TO_CODE: - parsed['city'] = ' '.join(tokens[:-2]) - parsed['state'] = _STATE_NAME_TO_CODE[two_word] - return 'LOCALITY', parsed - return 'UNKNOWN', parsed - else: - return 'UNKNOWN', parsed - - -# ═══════════════════════════════════════════════════════════════════ -# STEP 2: RETRIEVAL -# ═══════════════════════════════════════════════════════════════════ - -def _retrieve_netsyms(parsed, limit=10, lat=None, lon=None): - """Query Netsyms for structured address lookup. Returns list of candidate dicts.""" - try: - from . import netsyms - except Exception: - return [] - - results = [] - number = parsed.get('number', '') - street = parsed.get('street_raw') or parsed.get('street', '') - city = parsed.get('city', '') - state = parsed.get('state', '') - zipcode = parsed.get('zipcode', '') - - # When viewport provided, fetch more results to sort from - fetch_limit = 200 if (lat is not None and lon is not None) else limit - - if number and street: - rows = netsyms.lookup_by_street( - number, street, city=city, state=state, zipcode=zipcode, limit=fetch_limit - ) - elif zipcode: - rows = netsyms.lookup_by_zipcode(zipcode, limit=fetch_limit) - else: - return [] - - for row in rows: - addr_parts = [row['number'], row['street']] - if row.get('street2'): - addr_parts.append(row['street2']) - addr_parts.extend([row['city'], row['state'], row['zipcode']]) - display = ' '.join(p for p in addr_parts if p) - results.append({ - 'name': display, - 'lat': row['lat'], - 'lon': row['lon'], - 'source': 'netsyms', - 'type': 'street_address', - 'raw': row, - '_number': row.get('number', ''), - '_street': row.get('street', ''), - '_city': row.get('city', ''), - '_state': row.get('state', ''), - }) - # Sort by viewport distance if lat/lon provided, then limit - if lat is not None and lon is not None and results: - results.sort(key=lambda r: (r["lat"] - lat)**2 + (r["lon"] - lon)**2) - results = results[:limit] - return results - - -def _retrieve_photon_structured(parsed, limit=10): - """Query Photon /structured endpoint for address lookup.""" - params = {'limit': limit, 'countrycode': 'US'} - if parsed.get('street'): - params['street'] = parsed['street'] - if parsed.get('number'): - params['housenumber'] = parsed['number'] - if parsed.get('city'): - params['city'] = parsed['city'] - if parsed.get('state'): - params['state'] = parsed['state'] - - if 'street' not in params: - return [] - - try: - resp = requests.get(f"{PHOTON_URL}/structured", params=params, timeout=5) - resp.raise_for_status() - data = resp.json() - except Exception as e: - logger.debug("Photon /structured failed: %s", e) - return [] - - return _parse_photon_features(data.get('features', []), 'photon') - - -def _retrieve_photon_freetext(query, limit=10, lat=None, lon=None, zoom=None): - """Query Photon /api for free-text search with location bias.""" - try: - params = { - 'q': query, - 'limit': limit, - 'lat': lat if lat is not None else GEOCODE_BIAS_LAT, - 'lon': lon if lon is not None else GEOCODE_BIAS_LON, - 'zoom': int(zoom) if zoom is not None else GEOCODE_BIAS_ZOOM, - } - resp = requests.get(f"{PHOTON_URL}/api", params=params, timeout=5) - resp.raise_for_status() - data = resp.json() - except Exception as e: - return [] - - return _parse_photon_features(data.get('features', []), 'photon') - - -def _parse_photon_features(features, source): - """Convert Photon GeoJSON features to candidate dicts.""" - results = [] - for i, feature in enumerate(features): - props = feature.get('properties', {}) - coords = feature.get('geometry', {}).get('coordinates', [0, 0]) - - osm_key = props.get('osm_key', '') - osm_value = props.get('osm_value', '') - feat_type = props.get('type', '') - has_hn = bool(props.get('housenumber')) - - if osm_key in ('amenity', 'shop', 'tourism', 'leisure', 'office'): - rtype = 'poi' - elif has_hn or osm_value in ('house', 'residential'): - rtype = 'street_address' - elif feat_type in ('city', 'town', 'village', 'hamlet', 'county', 'state', 'country'): - rtype = 'locality' - else: - rtype = 'poi' - - # Build display name - parts = [] - hn = props.get('housenumber') - street = props.get('street') - name = props.get('name', '') - if hn and street: - parts.append(f"{hn} {street}") - if name and name != street: - parts.append(name) - elif name: - parts.append(name) - elif street: - parts.append(street) - for key in ('city', 'county', 'state', 'country'): - v = props.get(key) - if v and (not parts or v != parts[-1]): - parts.append(v) - display = ', '.join(p for p in parts if p) or 'Unknown' - - results.append({ - 'name': display, - 'lat': coords[1], - 'lon': coords[0], - 'source': source, - 'type': rtype, - 'raw': props, - '_photon_rank': i, - '_number': props.get('housenumber', ''), - '_street': props.get('street', ''), - # For locality results, the name IS the city (Photon omits 'city' on city-type features) - '_city': props.get('city', '') or (props.get('name', '') if rtype == 'locality' else ''), - '_state': props.get('state', ''), - }) - return results - - -# ═══════════════════════════════════════════════════════════════════ -# STEP 3: RERANKER -# ═══════════════════════════════════════════════════════════════════ - -def _expand_street_type(s): - """Expand a street type abbreviation for comparison.""" - return _STREET_TYPE_ABBREVS.get(s.lower(), s.lower()) - - -def _score_candidate(candidate, parsed, intent): - """ - Score a candidate against the parsed query. - Returns (total_score, signal_breakdown_dict). - """ - signals = {} - total = 0.0 - - query_number = (parsed.get('number') or '').strip().upper() - query_street = (parsed.get('street') or '').strip().upper() - query_city = (parsed.get('city') or '').strip().upper() - query_state = (parsed.get('state') or '').strip().upper() - - cand_number = (candidate.get('_number') or '').strip().upper() - cand_street = (candidate.get('_street') or '').strip().upper() - cand_city = (candidate.get('_city') or '').strip().upper() - cand_state = (candidate.get('_state') or '').strip().upper() - - # ── Housenumber ── - if intent == 'ADDRESS' and query_number: - if cand_number == query_number: - signals['housenumber_exact'] = W_HOUSENUMBER_EXACT - total += W_HOUSENUMBER_EXACT - elif cand_number and cand_number != query_number: - signals['housenumber_mismatch'] = W_HOUSENUMBER_MISMATCH - total += W_HOUSENUMBER_MISMATCH - - # ── Street name fuzz ── - if query_street and cand_street: - # Expand both for comparison - q_expanded = ' '.join(_expand_street_type(t) for t in query_street.split()) - c_expanded = ' '.join(_expand_street_type(t) for t in cand_street.split()) - ratio = fuzz.token_sort_ratio(q_expanded, c_expanded) / 100.0 - score = ratio * W_STREET_NAME_FUZZ - signals['street_name_fuzz'] = round(score, 2) - total += score - - # ── Street type match ── - if query_street and cand_street: - q_tokens = set(_expand_street_type(t) for t in query_street.split()) - c_tokens = set(_expand_street_type(t) for t in cand_street.split()) - # Check if the street type words overlap - street_types = set(_STREET_TYPE_ABBREVS.values()) - q_types = q_tokens & street_types - c_types = c_tokens & street_types - if q_types and q_types & c_types: - signals['street_type_match'] = W_STREET_TYPE_MATCH - total += W_STREET_TYPE_MATCH - - # ── Token coverage ── - raw_q = parsed.get('raw_query', '').upper() - q_tokens = set(raw_q.replace(',', ' ').split()) - if q_tokens: - cand_text = candidate.get('name', '').upper() - matched = sum(1 for t in q_tokens if t in cand_text) - coverage = matched / len(q_tokens) - score = coverage * W_TOKEN_COVERAGE - signals['token_coverage'] = round(score, 2) - total += score - - # ── Locality fuzz ── - if query_city and cand_city: - ratio = fuzz.ratio(query_city, cand_city) / 100.0 - score = ratio * W_LOCALITY_FUZZ - signals['locality_fuzz'] = round(score, 2) - total += score - - # ── State exact ── - if query_state and cand_state: - if cand_state == query_state: - signals['state_exact'] = W_STATE_EXACT - total += W_STATE_EXACT - - # ── Source authority ── - if candidate.get('source') == 'netsyms' and intent == 'ADDRESS': - signals['source_authority'] = W_SOURCE_AUTHORITY - total += W_SOURCE_AUTHORITY - - # ── Layer rank (type-appropriate bonus) ── - cand_type = candidate.get('type', '') - if intent == 'ADDRESS' and cand_type == 'street_address': - signals['layer_rank'] = W_LAYER_RANK - total += W_LAYER_RANK - elif intent == 'LOCALITY' and cand_type == 'locality': - signals['layer_rank'] = W_LAYER_RANK - total += W_LAYER_RANK - elif intent == 'POI' and cand_type == 'poi': - signals['layer_rank'] = W_LAYER_RANK - total += W_LAYER_RANK - - # ── Photon position normalization ── - photon_rank = candidate.get('_photon_rank') - if photon_rank is not None: - # Top result gets full bonus, decays linearly - score = max(0, (1.0 - photon_rank / 10.0)) * W_PHOTON_POSITION_NORM - signals['photon_position'] = round(score, 2) - total += score - - # ── Business intent POI boost ── - # When the query has no road keywords (likely a business/POI search), - # boost amenity/shop/etc results and penalize highway/route results. - # Skipped for LOCALITY, POSTCODE, COORD queries where class is irrelevant. - if intent not in ('LOCALITY', 'POSTCODE', 'COORD'): - q_tokens_lower = set(parsed.get('raw_query', '').lower().replace(',', ' ').split()) - if not (q_tokens_lower & _ROAD_KEYWORDS): - osm_key = (candidate.get('raw') or {}).get('osm_key', '') - if osm_key in ('amenity', 'shop', 'tourism', 'leisure', 'office', 'craft'): - signals['poi_class_boost'] = W_POI_CLASS_BOOST - total += W_POI_CLASS_BOOST - elif osm_key in ('highway', 'route'): - signals['highway_class_penalty'] = W_HIGHWAY_CLASS_PENALTY - total += W_HIGHWAY_CLASS_PENALTY - - return round(total, 2), signals - - -def _build_match_code(candidate, parsed, intent): - """Build a match_code dict indicating match quality for each field.""" - mc = {} - if intent == 'ADDRESS': - q_num = (parsed.get('number') or '').strip().upper() - c_num = (candidate.get('_number') or '').strip().upper() - if q_num and c_num == q_num: - mc['housenumber'] = 'matched' - elif q_num and c_num: - mc['housenumber'] = 'unmatched' - elif q_num and not c_num: - mc['housenumber'] = 'inferred' - - q_street = (parsed.get('street') or '').strip().upper() - c_street = (candidate.get('_street') or '').strip().upper() - if q_street and c_street: - q_exp = ' '.join(_expand_street_type(t) for t in q_street.split()) - c_exp = ' '.join(_expand_street_type(t) for t in c_street.split()) - ratio = fuzz.token_sort_ratio(q_exp, c_exp) / 100.0 - mc['street'] = 'matched' if ratio > 0.8 else 'unmatched' - elif q_street: - mc['street'] = 'inferred' - - q_city = (parsed.get('city') or '').strip().upper() - c_city = (candidate.get('_city') or '').strip().upper() - if q_city and c_city: - ratio = fuzz.ratio(q_city, c_city) / 100.0 - mc['city'] = 'matched' if ratio > 0.8 else 'unmatched' - elif q_city: - mc['city'] = 'inferred' - - return mc - - -def _rerank(candidates, parsed, intent, query, limit): - """Score, sort, and trim candidates. Trace-log top 3.""" - scored = [] - for c in candidates: - total, signals = _score_candidate(c, parsed, intent) - c['_score'] = total - c['_signals'] = signals - scored.append(c) - - scored.sort(key=lambda c: c['_score'], reverse=True) - - # Trace log for audit - _trace_logger.debug("─── Query: %r intent=%s ───", query, intent) - for i, c in enumerate(scored): - osm_key = (c.get('raw') or {}).get('osm_key', '—') - osm_val = (c.get('raw') or {}).get('osm_value', '—') - _trace_logger.debug( - " #%d score=%.2f src=%s key=%s/%s name=%s", - i, c['_score'], c.get('source', '?'), osm_key, osm_val, - c.get('name', '?')[:60] - ) - _trace_logger.debug(" signals=%s", c.get('_signals', {})) - - # Clean internal fields and add match_code - result = [] - for c in scored[:limit]: - mc = _build_match_code(c, parsed, intent) - - # Assign confidence from score - score = c.get('_score', 0) - if score >= 10: - confidence = 'exact' - elif score >= 5: - confidence = 'high' - elif score >= 2: - confidence = 'medium' - else: - confidence = 'low' - - entry = { - 'name': c['name'], - 'lat': c['lat'], - 'lon': c['lon'], - 'source': c['source'], - 'confidence': confidence, - 'type': c.get('type', 'poi'), - 'raw': c.get('raw'), - } - if mc: - entry['match_code'] = mc - result.append(entry) - - return result - - -# ═══════════════════════════════════════════════════════════════════ -# STEP 4: ANNOTATION -# ═══════════════════════════════════════════════════════════════════ - -def _haversine_m(lat1, lon1, lat2, lon2): - """Haversine distance in meters.""" - R = 6_371_000 - rlat1, rlat2 = math.radians(lat1), math.radians(lat2) - dlat = math.radians(lat2 - lat1) - dlon = math.radians(lon2 - lon1) - a = math.sin(dlat / 2) ** 2 + math.cos(rlat1) * math.cos(rlat2) * math.sin(dlon / 2) ** 2 - return R * 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a)) - - -def _annotate_with_address_book(results): - """Add labeled_as to results within radius of an address book entry.""" - try: - from . import address_book - entries = address_book.load() - except Exception: - return - for result in results: - rlat, rlon = result.get('lat'), result.get('lon') - if rlat is None or rlon is None: - continue - for entry in entries: - elat, elon = entry.get('lat'), entry.get('lon') - if elat is None or elon is None: - continue - if _haversine_m(rlat, rlon, elat, elon) <= ADDRESS_BOOK_ANNOTATION_RADIUS_M: - result['labeled_as'] = entry['name'] - break - - -# ═══════════════════════════════════════════════════════════════════ -# PUBLIC API -# ═══════════════════════════════════════════════════════════════════ - -def geocode(query, limit=10, lat=None, lon=None, zoom=None): - """ - Structured geocoding with multi-source retrieval and reranking. - - Returns {query, results: [...], count} — always 200-safe. - """ - limit = max(1, min(limit, 20)) - q = (query or '').strip() - empty = {'query': q, 'results': [], 'count': 0} - - if not q: - return empty - - # ── Coordinate detection ── - coords = _parse_coords(q) - if coords: - return { - 'query': q, - 'results': [{ - 'name': q, - 'lat': coords[0], - 'lon': coords[1], - 'source': 'coordinates', - 'confidence': 'exact', - 'type': 'coordinates', - 'raw': None, - }], - 'count': 1, - } - - # ── Address book nickname short-circuit ── - normalized_q = ' '.join(q.lower().replace(',', ' ').split()) - is_single_word = ' ' not in normalized_q - try: - from . import address_book - ab_match = address_book.lookup(q) - if (ab_match - and ab_match['confidence'] == 'exact' - and ab_match.get('lat') and ab_match.get('lon') - and is_single_word): - logger.info("geocode: nickname short-circuit %r → %s", q, ab_match['name']) - return { - 'query': q, - 'results': [{ - 'name': ab_match.get('address') or ab_match['name'], - 'lat': ab_match['lat'], - 'lon': ab_match['lon'], - 'source': 'address_book', - 'confidence': 'exact', - 'type': 'nickname', - 'raw': ab_match, - }], - 'count': 1, - } - except Exception as e: - logger.debug("geocode: address_book lookup failed: %s", e) - - # ── Classify intent + parse ── - intent, parsed = _classify_and_parse(q) - logger.debug("geocode: intent=%s parsed=%s", intent, parsed) - - # ── Retrieve candidates ── - candidates = [] - - if intent == 'ADDRESS': - # Parallel: Netsyms (structured) + Photon (freetext with expanded query) - netsyms_results = _retrieve_netsyms(parsed, limit=limit, lat=lat, lon=lon) - photon_results = _retrieve_photon_freetext( - parsed.get('expanded_query', q), limit=limit, lat=lat, lon=lon, zoom=zoom - ) - # Also try Photon /structured for addresses - photon_struct = _retrieve_photon_structured(parsed, limit=5) - candidates = netsyms_results + photon_results + photon_struct - - elif intent == 'POSTCODE': - netsyms_results = _retrieve_netsyms(parsed, limit=limit, lat=lat, lon=lon) - photon_results = _retrieve_photon_freetext(q, limit=limit, lat=lat, lon=lon, zoom=zoom) - candidates = netsyms_results + photon_results - - elif intent in ('LOCALITY', 'POI', 'UNKNOWN'): - candidates = _retrieve_photon_freetext(q, limit=limit, lat=lat, lon=lon, zoom=zoom) - - # ── Deduplicate by (lat, lon) proximity ── - deduped = [] - for c in candidates: - is_dup = False - for existing in deduped: - if (_haversine_m(c['lat'], c['lon'], existing['lat'], existing['lon']) < 50 - and c.get('source') == existing.get('source')): - is_dup = True - break - if not is_dup: - deduped.append(c) - candidates = deduped - - # ── Rerank ── - results = _rerank(candidates, parsed, intent, q, limit) - - # ── Address book annotation ── - _annotate_with_address_book(results) - - logger.info("geocode: %r → intent=%s, %d results", q, intent, len(results)) - return {'query': q, 'results': results, 'count': len(results)} diff --git a/lib/geocode_test.py b/lib/geocode_test.py deleted file mode 100644 index 4717b1e..0000000 --- a/lib/geocode_test.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python3 -"""Tests for RECON Photon-first geocode chain.""" -import sys -import os -import json -import urllib.request -import urllib.parse - -BASE = "http://localhost:8420" - -TESTS = [ - { - "name": "home → nickname short-circuit", - "query": "home", - "check": lambda r: ( - r["count"] == 1 - and r["results"][0]["source"] == "address_book" - and r["results"][0]["confidence"] == "exact" - and r["results"][0]["type"] == "nickname" - ), - }, - { - "name": "214 north st filer → netsyms exact match (multi-word, not nickname)", - "query": "214 north st filer", - "check": lambda r: ( - r["count"] >= 1 - and r["results"][0]["source"] == "netsyms" - and r["results"][0]["confidence"] == "exact" - and r["results"][0]["type"] == "street_address" - ), - }, - { - "name": "214 North St, Filer, ID → netsyms (case/punctuation)", - "query": "214 North St, Filer, ID", - "check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "netsyms", - }, - { - "name": "214 NORTH ST FILER ID → netsyms (uppercase)", - "query": "214 NORTH ST FILER ID", - "check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "netsyms", - }, - { - "name": "1600 Pennsylvania Ave Washington DC → White House", - "query": "1600 Pennsylvania Ave Washington DC", - "check": lambda r: ( - r["count"] >= 1 - and r["results"][0]["source"] == "photon" - ), - }, - { - "name": "1600 pennsylvania ave washington dc → lowercase", - "query": "1600 pennsylvania ave washington dc", - "check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "photon", - }, - { - "name": "starbucks filer → POI result", - "query": "starbucks filer", - "check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "photon", - }, - { - "name": "filer idaho → locality", - "query": "filer idaho", - "check": lambda r: ( - r["count"] >= 1 - and r["results"][0]["source"] == "photon" - and r["results"][0]["type"] == "locality" - ), - }, - { - "name": "filer → partial query, at least 1 result", - "query": "filer", - "check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "photon", - }, - { - "name": "42.5736, -114.6066 → coordinates (with space)", - "query": "42.5736, -114.6066", - "check": lambda r: ( - r["count"] == 1 - and r["results"][0]["source"] == "coordinates" - and r["results"][0]["confidence"] == "exact" - and r["results"][0]["type"] == "coordinates" - ), - }, - { - "name": "42.5736,-114.6066 → coordinates (no space)", - "query": "42.5736,-114.6066", - "check": lambda r: ( - r["count"] == 1 - and r["results"][0]["source"] == "coordinates" - and r["results"][0]["confidence"] == "exact" - ), - }, - { - "name": "boise → at least 1 result", - "query": "boise", - "check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "photon", - }, - { - "name": "toronto → CA canary", - "query": "toronto", - "check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "photon", - }, - { - "name": "asdfghjklqwerty → empty results, 200 OK", - "query": "asdfghjklqwerty", - "check": lambda r: r["count"] == 0 and r["results"] == [], - }, - { - "name": "empty query → empty results", - "query": "", - "check": lambda r: r["count"] == 0 and r["results"] == [], - }, -] - -passed = 0 -failed = 0 - -for t in TESTS: - q = urllib.parse.urlencode({"q": t["query"]}) if t["query"] else "q=" - url = f"{BASE}/api/geocode?{q}" - try: - req = urllib.request.Request(url) - with urllib.request.urlopen(req, timeout=10) as resp: - status = resp.status - body = json.loads(resp.read()) - except urllib.error.HTTPError as e: - status = e.code - try: - body = json.loads(e.read()) - except Exception: - body = {} - except Exception as e: - status = 0 - body = {} - print(f" [FAIL] {t['name']}") - print(f" EXCEPTION: {e}") - failed += 1 - continue - - ok = status == 200 and t["check"](body) - tag = "PASS" if ok else "FAIL" - if ok: - passed += 1 - else: - failed += 1 - - top = body.get("results", [{}])[0] if body.get("results") else {} - top_summary = f"source={top.get('source','—')} type={top.get('type','—')} conf={top.get('confidence','—')} name={top.get('name','—')[:50]}" - print(f" [{tag}] {t['name']}") - if not ok: - print(f" HTTP {status}, count={body.get('count','?')}, top: {top_summary}") - else: - labeled = f" labeled_as={top.get('labeled_as')}" if top.get('labeled_as') else "" - print(f" → {top_summary}{labeled}") - -print(f"\n{passed} passed, {failed} failed") -sys.exit(0 if failed == 0 else 1) diff --git a/lib/google_places.py b/lib/google_places.py deleted file mode 100644 index 8272b81..0000000 --- a/lib/google_places.py +++ /dev/null @@ -1,397 +0,0 @@ -""" -Google Places (New) API client for tertiary enrichment. - -Searches for business POIs and fetches details (opening hours, phone, website) -when OSM + Overture data is incomplete. Uses field masks to minimize cost. - -API docs: https://developers.google.com/maps/documentation/places/web-service -""" -import json -import os -import sqlite3 -import time -from datetime import date, timezone, datetime - -import requests - -from .utils import setup_logging - -logger = setup_logging('recon.google_places') - -API_BASE = 'https://places.googleapis.com/v1' -DEFAULT_DAILY_CAP = 500 -REQUEST_TIMEOUT = 3 # seconds - -# Google day index → OSM abbreviation -_DAY_ABBR = ['Su', 'Mo', 'Tu', 'We', 'Th', 'Fr', 'Sa'] - -_db_conn = None - - -def _get_db(): - """Return a module-level SQLite connection (lazy init).""" - global _db_conn - if _db_conn is not None: - return _db_conn - - db_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data') - db_path = os.path.join(db_dir, 'place_cache.db') - _db_conn = sqlite3.connect(db_path, check_same_thread=False) - _db_conn.execute("PRAGMA journal_mode=WAL") - _db_conn.execute("PRAGMA synchronous=NORMAL") - # Ensure google_api_calls table exists - _db_conn.execute(""" - CREATE TABLE IF NOT EXISTS google_api_calls ( - call_date TEXT PRIMARY KEY, - call_count INTEGER NOT NULL DEFAULT 0 - ) - """) - _db_conn.commit() - return _db_conn - - -def _get_api_key(): - """Return the Google Places API key from environment.""" - key = os.environ.get('GOOGLE_PLACES_API_KEY') - if not key: - logger.error("GOOGLE_PLACES_API_KEY not set in environment") - return key - - -def _get_daily_cap(): - """Return the daily API call cap (configurable via deployment config).""" - try: - from .deployment_config import get_deployment_config - config = get_deployment_config() - return config.get('google_places', {}).get('daily_cap', DEFAULT_DAILY_CAP) - except Exception: - return DEFAULT_DAILY_CAP - - -# ── Daily call counter ────────────────────────────────────────────────── - -def check_daily_cap(): - """Return True if under daily cap, False if limit reached.""" - db = _get_db() - today = date.today().isoformat() - row = db.execute( - "SELECT call_count FROM google_api_calls WHERE call_date = ?", (today,) - ).fetchone() - current = row[0] if row else 0 - cap = _get_daily_cap() - if current >= cap: - logger.info(f"google_places: daily_cap_reached count={current} cap={cap}") - return False - return True - - -def get_daily_count(): - """Return today's API call count.""" - db = _get_db() - today = date.today().isoformat() - row = db.execute( - "SELECT call_count FROM google_api_calls WHERE call_date = ?", (today,) - ).fetchone() - return row[0] if row else 0 - - -def increment_call_counter(): - """Atomically increment today's API call counter.""" - db = _get_db() - today = date.today().isoformat() - db.execute(""" - INSERT INTO google_api_calls (call_date, call_count) VALUES (?, 1) - ON CONFLICT(call_date) DO UPDATE SET call_count = call_count + 1 - """, (today,)) - db.commit() - - -def _set_daily_count_to_cap(): - """Set today's counter to the cap value (soft-stop on quota error).""" - db = _get_db() - today = date.today().isoformat() - cap = _get_daily_cap() - db.execute(""" - INSERT INTO google_api_calls (call_date, call_count) VALUES (?, ?) - ON CONFLICT(call_date) DO UPDATE SET call_count = ? - """, (today, cap, cap)) - db.commit() - - -# ── Google Places cache (on place_cache table) ───────────────────────── - -def cache_get_google(osm_type, osm_id): - """Return (google_place_id, google_data_dict) or (None, None).""" - db = _get_db() - row = db.execute( - "SELECT google_place_id, google_data FROM place_cache WHERE osm_type=? AND osm_id=?", - (osm_type, osm_id) - ).fetchone() - if row and row[0]: - data = None - if row[1]: - try: - data = json.loads(row[1]) - except (json.JSONDecodeError, TypeError): - pass - return row[0], data - return None, None - - -def cache_put_google(osm_type, osm_id, place_id, data): - """Store Google Places data for a cache entry (UPSERT on google columns).""" - db = _get_db() - now = int(time.time()) - db.execute(""" - INSERT INTO place_cache (osm_type, osm_id, data, source, cached_at, google_place_id, google_data, google_fetched_at) - VALUES (?, ?, '', 'pending', 0, ?, ?, ?) - ON CONFLICT(osm_type, osm_id) DO UPDATE SET - google_place_id = excluded.google_place_id, - google_data = excluded.google_data, - google_fetched_at = excluded.google_fetched_at - """, (osm_type, osm_id, place_id, json.dumps(data) if data else None, now)) - db.commit() - - -# ── API calls ─────────────────────────────────────────────────────────── - -def search_place(name, lat, lon, radius_m=200): - """ - Search Google Places (New) for a business by name + location. - Returns the Google Place ID of the best match, or None. - """ - key = _get_api_key() - if not key: - return None - - if not check_daily_cap(): - return None - - try: - resp = requests.post( - f'{API_BASE}/places:searchText', - headers={ - 'Content-Type': 'application/json', - 'X-Goog-Api-Key': key, - 'X-Goog-FieldMask': 'places.id,places.displayName,places.location', - }, - json={ - 'textQuery': name, - 'locationBias': { - 'circle': { - 'center': {'latitude': lat, 'longitude': lon}, - 'radius': float(radius_m), - } - }, - 'maxResultCount': 1, - }, - timeout=REQUEST_TIMEOUT, - ) - - increment_call_counter() - - if resp.status_code == 429: - logger.warning("google_places: action=search place=%s result=rate_limited", name) - _set_daily_count_to_cap() - return None - - if resp.status_code == 403: - logger.error("google_places: action=search place=%s result=forbidden (invalid key?)", name) - return None - - if resp.status_code != 200: - logger.warning("google_places: action=search place=%s result=error status=%d", name, resp.status_code) - return None - - data = resp.json() - places = data.get('places', []) - if not places: - logger.info("google_places: action=search place=%s result=miss", name) - return None - - place_id = places[0].get('id') - display = places[0].get('displayName', {}).get('text', '?') - logger.info("google_places: action=search place=%s result=hit google_name=%s id=%s", name, display, place_id) - return place_id - - except requests.exceptions.Timeout: - logger.warning("google_places: action=search place=%s result=timeout", name) - return None - except Exception as e: - logger.error("google_places: action=search place=%s result=error err=%s", name, e) - return None - - -def get_place_details(place_id): - """ - Fetch details for a Google Place ID. - Returns dict with {opening_hours, phone_number, website} or None. - """ - key = _get_api_key() - if not key: - return None - - if not check_daily_cap(): - return None - - try: - resp = requests.get( - f'{API_BASE}/places/{place_id}', - headers={ - 'X-Goog-Api-Key': key, - 'X-Goog-FieldMask': 'regularOpeningHours,internationalPhoneNumber,websiteUri', - }, - timeout=REQUEST_TIMEOUT, - ) - - increment_call_counter() - - if resp.status_code == 429: - logger.warning("google_places: action=details id=%s result=rate_limited", place_id) - _set_daily_count_to_cap() - return None - - if resp.status_code != 200: - logger.warning("google_places: action=details id=%s result=error status=%d", place_id, resp.status_code) - return None - - data = resp.json() - result = { - 'opening_hours': None, - 'opening_hours_raw': None, - 'phone_number': None, - 'website': None, - } - - # Phone - phone = data.get('internationalPhoneNumber') - if phone: - result['phone_number'] = phone.replace(' ', '').replace('-', '') - - # Website - result['website'] = data.get('websiteUri') - - # Opening hours - hours = data.get('regularOpeningHours') - if hours: - # Try OSM-compatible format from periods - periods = hours.get('periods', []) - if periods: - osm_str = _periods_to_osm(periods) - if osm_str: - result['opening_hours'] = osm_str - - # Fallback: weekday descriptions (human-readable) - if not result['opening_hours']: - descriptions = hours.get('weekdayDescriptions') - if descriptions: - result['opening_hours_raw'] = descriptions - - logger.info("google_places: action=details id=%s result=hit hours=%s phone=%s website=%s", - place_id, - 'yes' if result['opening_hours'] or result['opening_hours_raw'] else 'no', - 'yes' if result['phone_number'] else 'no', - 'yes' if result['website'] else 'no') - return result - - except requests.exceptions.Timeout: - logger.warning("google_places: action=details id=%s result=timeout", place_id) - return None - except Exception as e: - logger.error("google_places: action=details id=%s result=error err=%s", place_id, e) - return None - - -# ── Opening hours conversion ──────────────────────────────────────────── - -def _periods_to_osm(periods): - """ - Convert Google Places periods array to OSM opening_hours string. - - Google periods: [{"open": {"day": 0-6, "hour": H, "minute": M}, - "close": {"day": 0-6, "hour": H, "minute": M}}, ...] - Where day 0 = Sunday. - - OSM format: "Mo-Fr 06:00-23:00; Sa-Su 07:00-23:00" - """ - if not periods: - return None - - # Check for 24/7: single period with no close, or open 00:00 close 00:00 next day - if len(periods) == 1: - p = periods[0] - o = p.get('open', {}) - c = p.get('close') - if c is None and o.get('hour', 0) == 0 and o.get('minute', 0) == 0: - return '24/7' - - # Build a map: day_index → "HH:MM-HH:MM" - day_hours = {} # day_index → time_range string - for p in periods: - o = p.get('open', {}) - c = p.get('close', {}) - day = o.get('day', 0) - open_time = f"{o.get('hour', 0):02d}:{o.get('minute', 0):02d}" - - if c: - close_time = f"{c.get('hour', 0):02d}:{c.get('minute', 0):02d}" - # Handle midnight closing (00:00 means end of day) - if close_time == '00:00': - close_time = '24:00' - else: - close_time = '24:00' - - time_range = f"{open_time}-{close_time}" - - # A day can have multiple periods (e.g., lunch break) - if day in day_hours: - day_hours[day] = day_hours[day] + ',' + time_range - else: - day_hours[day] = time_range - - if not day_hours: - return None - - # Check if all 7 days have same hours - unique_ranges = set(day_hours.values()) - if len(day_hours) == 7 and len(unique_ranges) == 1: - hours = unique_ranges.pop() - if hours == '00:00-24:00': - return '24/7' - return hours # implicit "every day" - - # Group consecutive days with same hours - # Reorder to OSM convention: Mo(1) Tu(2) We(3) Th(4) Fr(5) Sa(6) Su(0) - osm_day_order = [1, 2, 3, 4, 5, 6, 0] - groups = [] - current_days = [] - current_hours = None - - for day_idx in osm_day_order: - hours = day_hours.get(day_idx) - if hours == current_hours: - current_days.append(day_idx) - else: - if current_days and current_hours: - groups.append((current_days, current_hours)) - current_days = [day_idx] - current_hours = hours - - if current_days and current_hours: - groups.append((current_days, current_hours)) - - if not groups: - return None - - # Format each group - parts = [] - for days, hours in groups: - if len(days) == 1: - day_str = _DAY_ABBR[days[0]] - elif len(days) == 2: - day_str = f"{_DAY_ABBR[days[0]]},{_DAY_ABBR[days[1]]}" - else: - day_str = f"{_DAY_ABBR[days[0]]}-{_DAY_ABBR[days[-1]]}" - parts.append(f"{day_str} {hours}") - - return '; '.join(parts) diff --git a/lib/landclass.py b/lib/landclass.py deleted file mode 100644 index f581994..0000000 --- a/lib/landclass.py +++ /dev/null @@ -1,252 +0,0 @@ -""" -PAD-US land classification lookup. - -Provides point-in-polygon queries against the USGS Protected Areas Database -(PAD-US) stored in a local PostGIS database. Returns land ownership, -management, and public access information for any lat/lon coordinate. - -Connection pool is lazy-initialized on first call. If PostgreSQL is unreachable, -functions return empty results gracefully (feature degrades, doesn't crash). -""" -import os - -import psycopg2 -import psycopg2.pool - -from .utils import setup_logging - -logger = setup_logging('recon.landclass') - -_pool = None -_pool_failed = False - -# ── Label mappings from PAD-US domain tables ──────────────────────────── -# Extracted from PADUS4_0_Geodatabase.gdb domain lookup layers. -# ogr2ogr lowercases all column names. - -AGENCY_NAME_MAP = { - 'TVA': 'Tennessee Valley Authority', - 'BLM': 'Bureau of Land Management', - 'BOEM': 'Bureau of Ocean Energy Management', - 'USBR': 'Bureau of Reclamation', - 'FWS': 'U.S. Fish and Wildlife Service', - 'USFS': 'Forest Service', - 'DOD': 'Department of Defense', - 'USACE': 'Army Corps of Engineers', - 'DOE': 'Department of Energy', - 'NPS': 'National Park Service', - 'NRCS': 'Natural Resources Conservation Service', - 'ARS': 'Agricultural Research Service', - 'BIA': 'Bureau of Indian Affairs', - 'NOAA': 'National Oceanic and Atmospheric Administration', - 'BPA': 'Bonneville Power Administration', - 'OTHF': 'Other or Unknown Federal Land', - 'TRIB': 'American Indian Lands', - 'SPR': 'State Park and Recreation', - 'SDC': 'State Department of Conservation', - 'SLB': 'State Land Board', -} - -AGENCY_TYPE_MAP = { - 'FED': 'Federal', - 'TRIB': 'American Indian Lands', - 'STAT': 'State', - 'DIST': 'Regional Agency Special District', - 'LOC': 'Local Government', - 'NGO': 'Non-Governmental Organization', - 'PVT': 'Private', - 'JNT': 'Joint', - 'UNK': 'Unknown', - 'TERR': 'Territorial', - 'DESG': 'Designation', -} - -DESIGNATION_TYPE_MAP = { - 'NP': 'National Park', - 'NM': 'National Monument', - 'NCA': 'Conservation Area', - 'NF': 'National Forest', - 'NG': 'National Grassland', - 'PUB': 'National Public Lands', - 'NT': 'National Scenic or Historic Trail', - 'NWR': 'National Wildlife Refuge', - 'WA': 'Wilderness Area', - 'WSR': 'Wild and Scenic River', - 'WSA': 'Wilderness Study Area', - 'MPA': 'Marine Protected Area', - 'NRA': 'National Recreation Area', - 'NSBV': 'National Scenic, Botanical or Volcanic Area', - 'NLS': 'National Lakeshore or Seashore', - 'IRA': 'Inventoried Roadless Area', - 'ACEC': 'Area of Critical Environmental Concern', - 'RNA': 'Research Natural Area', - 'REC': 'Recreation Management Area', - 'RMA': 'Resource Management Area', - 'WPA': 'Watershed Protection Area', - 'REA': 'Research or Educational Area', - 'HCA': 'Historic or Cultural Area', - 'MIT': 'Mitigation Land or Bank', - 'MIL': 'Military Land', - 'ACC': 'Access Area', - 'SDA': 'Special Designation Area', - 'PROC': 'Approved or Proclamation Boundary', - 'FOTH': 'Federal Other or Unknown', - 'ND': 'Not Designated', -} - -PUBLIC_ACCESS_MAP = { - 'OA': 'Open Access', - 'RA': 'Restricted Access', - 'XA': 'Closed', - 'UK': 'Unknown', -} - -GAP_STATUS_MAP = { - '1': 'Managed for biodiversity (disturbance events proceed)', - '2': 'Managed for biodiversity (disturbance suppressed)', - '3': 'Multiple uses (extractive/OHV)', - '4': 'No known mandate for biodiversity protection', -} - -CATEGORY_MAP = { - 'Fee': 'Fee', - 'Easement': 'Easement', - 'Other': 'Other', - 'Unknown': 'Unknown', - 'Designation': 'Designation', - 'Marine': 'Marine Area', - 'Proclamation': 'Approved, Proclamation or Extent Boundary', -} - -STATE_MAP = { - 'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', - 'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', - 'DC': 'District of Columbia', 'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', - 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', - 'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', - 'MD': 'Maryland', 'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', - 'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', - 'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', - 'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', - 'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', - 'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', - 'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', - 'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming', -} - - -def _decode(code, label_map): - """Decode a PAD-US code using a label map. Returns decoded label or the raw code.""" - if not code: - return '' - code = str(code).strip() - return label_map.get(code, code) - - -def _get_pool(): - """Lazy-init the connection pool. Returns None if Postgres is unreachable.""" - global _pool, _pool_failed - if _pool is not None: - return _pool - if _pool_failed: - return None - - try: - _pool = psycopg2.pool.SimpleConnectionPool( - minconn=1, - maxconn=3, - host=os.environ.get('PADUS_DB_HOST', 'localhost'), - port=int(os.environ.get('PADUS_DB_PORT', '5432')), - dbname=os.environ.get('PADUS_DB_NAME', 'padus'), - user=os.environ.get('PADUS_DB_USER', 'overture'), - password=os.environ.get('PADUS_DB_PASSWORD', ''), - connect_timeout=5, - ) - logger.info("PAD-US PostgreSQL connection pool initialized") - return _pool - except Exception as e: - _pool_failed = True - logger.warning(f"PAD-US PostgreSQL unavailable, land classification disabled: {e}") - return None - - -def _query_all(sql, params): - """Execute a query and return all rows as a list of dicts, or empty list.""" - pool = _get_pool() - if pool is None: - return [] - - conn = None - try: - conn = pool.getconn() - with conn.cursor() as cur: - cur.execute(sql, params) - rows = cur.fetchall() - if not rows: - return [] - cols = [desc[0] for desc in cur.description] - return [dict(zip(cols, row)) for row in rows] - except Exception as e: - logger.warning(f"PAD-US query error: {e}") - if conn: - try: - conn.rollback() - except Exception: - pass - return [] - finally: - if conn: - try: - pool.putconn(conn) - except Exception: - pass - - -def lookup_landclass(lat, lon): - """ - Look up PAD-US land classifications for a point. - - Returns a list of classification dicts, ordered by area ascending - (smallest/most specific first). Empty list on error or no results. - """ - rows = _query_all( - """SELECT unit_nm, mang_name, mang_type, own_name, own_type, - des_tp, gap_sts, pub_access, category, gis_acres, state_nm - FROM pad_units - WHERE ST_Intersects(geom, ST_SetSRID(ST_MakePoint(%s, %s), 4326)) - ORDER BY gis_acres ASC - LIMIT 10""", - (lon, lat) - ) - - results = [] - for row in rows: - pa_code = str(row.get('pub_access', '')).strip() - - results.append({ - 'unit_name': (row.get('unit_nm') or '').strip(), - 'manager_name': _decode(row.get('mang_name'), AGENCY_NAME_MAP), - 'manager_type': _decode(row.get('mang_type'), AGENCY_TYPE_MAP), - 'owner_type': _decode(row.get('own_type'), AGENCY_TYPE_MAP), - 'designation_type': _decode(row.get('des_tp'), DESIGNATION_TYPE_MAP), - 'gap_status': str(row.get('gap_sts', '')).strip(), - 'public_access': _decode(pa_code, PUBLIC_ACCESS_MAP), - 'public_access_code': pa_code, - 'category': _decode(row.get('category'), CATEGORY_MAP), - 'acres': row.get('gis_acres'), - 'state': _decode(row.get('state_nm'), STATE_MAP), - }) - - return results - - -def format_summary(classifications): - """ - Format a human-readable summary from classification results. - - Returns the most specific unit name, or None if no results. - """ - if not classifications: - return None - # First result is smallest/most specific (ordered by acres ASC) - return classifications[0].get('unit_name') or None diff --git a/lib/nav_tools.py b/lib/nav_tools.py deleted file mode 100644 index d4bb1f7..0000000 --- a/lib/nav_tools.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Navigation tools: geocoding via Photon and routing via Valhalla.""" - -import math -import re -import requests - -from .utils import setup_logging - -logger = setup_logging('recon.nav_tools') - -PHOTON_URL = "http://localhost:2322" -VALHALLA_URL = "http://localhost:8002" - -# Regional bias for Photon searches (Idaho-centric for Matt's use case). -# Adjustable — Photon uses these to rank nearby results higher. -GEOCODE_BIAS_LAT = 42.5736 -GEOCODE_BIAS_LON = -114.6066 -GEOCODE_BIAS_ZOOM = 10 - -# Distance threshold (meters) for annotating Photon results with address -# book labels. 75m covers GPS jitter + geocoder imprecision. -ADDRESS_BOOK_ANNOTATION_RADIUS_M = 75 - -# Coordinate regex — handles comma-separated and space-separated forms. -_COORD_RE = re.compile( - r'^\s*(-?\d+\.\d+)\s*[,\s]\s*(-?\d+\.\d+)\s*$' -) - -VALID_MODES = {"auto", "pedestrian", "bicycle", "truck"} - - -def _parse_coords(text: str): - """Return (lat, lon) if text looks like coordinates with valid bounds, else None.""" - m = _COORD_RE.match(text.strip()) - if not m: - return None - lat, lon = float(m.group(1)), float(m.group(2)) - if -90 <= lat <= 90 and -180 <= lon <= 180: - return lat, lon - return None - - -def _haversine_m(lat1, lon1, lat2, lon2): - """Haversine distance in meters between two (lat, lon) points.""" - R = 6_371_000 # Earth radius in meters - rlat1, rlat2 = math.radians(lat1), math.radians(lat2) - dlat = math.radians(lat2 - lat1) - dlon = math.radians(lon2 - lon1) - a = math.sin(dlat / 2) ** 2 + math.cos(rlat1) * math.cos(rlat2) * math.sin(dlon / 2) ** 2 - return R * 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a)) - - -def geocode(query: str, limit: int = 10, lat=None, lon=None, zoom=None): - """Delegate to the structured geocode module. See lib/geocode.py.""" - from . import geocode as geocode_mod - return geocode_mod.geocode(query, limit=limit, lat=lat, lon=lon, zoom=zoom) - - -def _geocode(query: str): - """Internal: returns (lat, lon, display_name) tuple for route().""" - result = geocode(query, limit=1) - results = result.get('results', []) - if not results: - raise ValueError(f"Could not find location: {query}") - top = results[0] - return top['lat'], top['lon'], top['name'] - - -def reverse_geocode(lat: float, lon: float) -> str: - """Reverse geocode coordinates via Photon. Returns formatted address string.""" - try: - resp = requests.get( - f"{PHOTON_URL}/reverse", - params={"lat": lat, "lon": lon, "limit": 1}, - timeout=10, - ) - resp.raise_for_status() - except requests.RequestException: - raise RuntimeError("Navigation service unavailable") - - data = resp.json() - features = data.get("features", []) - if not features: - return f"{lat}, {lon}" - - props = features[0]["properties"] - parts = [] - for key in ("name", "housenumber", "street", "city", "state", "country", "postcode"): - v = props.get(key) - if v: - parts.append(v) - return ", ".join(parts) if parts else f"{lat}, {lon}" - - -def route(origin: str, destination: str, mode: str = "auto") -> dict: - """ - Get a route between two locations. - - Args: - origin: Starting location — address, place name, or "lat,lon" - destination: Destination — address, place name, or "lat,lon" - mode: Travel mode — auto, pedestrian, bicycle, truck - - Returns: - dict with summary, maneuvers, origin/destination info, and raw shape - """ - if mode not in VALID_MODES: - mode = "auto" - - # Geocode both endpoints - orig_lat, orig_lon, orig_name = _geocode(origin) - dest_lat, dest_lon, dest_name = _geocode(destination) - - # Query Valhalla - valhalla_req = { - "locations": [ - {"lat": orig_lat, "lon": orig_lon}, - {"lat": dest_lat, "lon": dest_lon}, - ], - "costing": mode, - "directions_options": {"units": "miles"}, - } - - try: - resp = requests.post( - f"{VALHALLA_URL}/route", - json=valhalla_req, - timeout=30, - ) - except requests.RequestException: - raise RuntimeError("Navigation service unavailable") - - if resp.status_code != 200: - try: - err = resp.json() - msg = err.get("error", "Unknown routing error") - except Exception: - msg = f"Routing error (HTTP {resp.status_code})" - raise RuntimeError(f"No route found between locations: {msg}") - - data = resp.json() - trip = data["trip"] - summary = trip["summary"] - leg = trip["legs"][0] - - # Build maneuver list - maneuvers = [] - for m in leg["maneuvers"]: - streets = m.get("street_names", []) - maneuvers.append({ - "instruction": m["instruction"], - "distance_miles": round(m.get("length", 0), 2), - "street_name": streets[0] if streets else "", - "type": m.get("type", 0), - "verbal_succinct": m.get("verbal_succinct_transition_instruction", ""), - }) - - return { - "origin": {"name": orig_name, "lat": orig_lat, "lon": orig_lon}, - "destination": {"name": dest_name, "lat": dest_lat, "lon": dest_lon}, - "summary": { - "distance_miles": round(summary["length"], 1), - "time_minutes": round(summary["time"] / 60, 1), - "mode": mode, - }, - "maneuvers": maneuvers, - "shape": leg.get("shape", ""), - } diff --git a/lib/nav_tools_test.py b/lib/nav_tools_test.py deleted file mode 100644 index b987293..0000000 --- a/lib/nav_tools_test.py +++ /dev/null @@ -1,77 +0,0 @@ -"""Tests for nav_tools — run against live Photon + Valhalla services.""" - -import sys -import json - -from nav_tools import route, reverse_geocode - - -def test_route_named(): - """route("Buhl Idaho", "Boise Idaho", "auto") returns maneuvers.""" - print("TEST 1: route('Buhl Idaho', 'Boise Idaho', 'auto')") - r = route("Buhl Idaho", "Boise Idaho", "auto") - assert r["summary"]["distance_miles"] > 50, f"Expected >50 mi, got {r['summary']['distance_miles']}" - assert r["summary"]["time_minutes"] > 60, f"Expected >60 min, got {r['summary']['time_minutes']}" - assert len(r["maneuvers"]) > 5, f"Expected >5 maneuvers, got {len(r['maneuvers'])}" - assert r["shape"], "Missing polyline shape" - print(f" OK — {r['summary']['distance_miles']} mi, {r['summary']['time_minutes']} min, {len(r['maneuvers'])} maneuvers") - print(f" Origin: {r['origin']['name']}") - print(f" Destination: {r['destination']['name']}") - print(f" First maneuver: {r['maneuvers'][0]['instruction']}") - - -def test_route_coords(): - """route with raw lat,lon coordinates.""" - print("\nTEST 2: route('42.5991,-114.7636', '43.615,-116.2023', 'auto')") - r = route("42.5991,-114.7636", "43.615,-116.2023", "auto") - assert r["summary"]["distance_miles"] > 100, f"Expected >100 mi, got {r['summary']['distance_miles']}" - assert len(r["maneuvers"]) > 3, f"Expected >3 maneuvers" - print(f" OK — {r['summary']['distance_miles']} mi, {r['summary']['time_minutes']} min") - - -def test_route_pedestrian(): - """route with pedestrian mode.""" - print("\nTEST 3: route('Buhl Idaho', 'Boise Idaho', 'pedestrian')") - r = route("Buhl Idaho", "Boise Idaho", "pedestrian") - assert r["summary"]["mode"] == "pedestrian" - assert r["summary"]["time_minutes"] > r["summary"]["distance_miles"], "Walking should take more min than miles" - print(f" OK — {r['summary']['distance_miles']} mi, {r['summary']['time_minutes']} min (pedestrian)") - - -def test_reverse_geocode(): - """reverse_geocode near Buhl, Idaho.""" - print("\nTEST 4: reverse_geocode(42.5991, -114.7636)") - result = reverse_geocode(42.5991, -114.7636) - assert "Buhl" in result or "Twin Falls" in result or "Idaho" in result, f"Expected Buhl/Idaho, got: {result}" - print(f" OK — {result}") - - -def test_route_bad_origin(): - """route with nonexistent place returns clean error.""" - print("\nTEST 5: route('nonexistent place xyz123abc', 'Boise Idaho')") - try: - r = route("nonexistent place xyz123abc", "Boise Idaho") - print(f" FAIL — expected error, got result: {r['summary']}") - return False - except ValueError as e: - print(f" OK — clean error: {e}") - except RuntimeError as e: - print(f" OK — runtime error: {e}") - - -if __name__ == "__main__": - passed = 0 - failed = 0 - tests = [test_route_named, test_route_coords, test_route_pedestrian, test_reverse_geocode, test_route_bad_origin] - - for test in tests: - try: - test() - passed += 1 - except Exception as e: - print(f" FAIL — {e}") - failed += 1 - - print(f"\n{'='*40}") - print(f"Results: {passed} passed, {failed} failed out of {len(tests)}") - sys.exit(1 if failed else 0) diff --git a/lib/netsyms_api.py b/lib/netsyms_api.py index 4a0847f..dbae24e 100644 --- a/lib/netsyms_api.py +++ b/lib/netsyms_api.py @@ -1,22 +1,18 @@ """ -RECON Netsyms API + Geocode — Flask Blueprints. +RECON Netsyms API — Flask Blueprint. GET /api/netsyms/lookup?q=&country= GET /api/netsyms/health -GET /api/geocode?q=&limit= (Photon-first search with ranked results) """ from flask import Blueprint, request, jsonify from . import netsyms -from . import address_book -from . import nav_tools from .utils import setup_logging logger = setup_logging('recon.netsyms_api') netsyms_bp = Blueprint('netsyms', __name__) -geocode_bp = Blueprint('geocode', __name__) @netsyms_bp.route('/api/netsyms/lookup') @@ -33,94 +29,3 @@ def api_netsyms_lookup(): @netsyms_bp.route('/api/netsyms/health') def api_netsyms_health(): return jsonify(netsyms.health()) - - - -def _safe_float(val, lo, hi): - """Parse val as float; return None if missing, non-numeric, or out of [lo, hi].""" - if val is None: - return None - try: - f = float(val) - if lo <= f <= hi: - return f - except (ValueError, TypeError): - pass - return None - -@geocode_bp.route('/api/geocode') -def api_geocode(): - """ - Photon-first geocoding with ranked candidates. - - GET /api/geocode?q=&limit= - - Always returns 200 OK with: - {query, results: [{name, lat, lon, source, confidence, type, raw, ...}], count} - - - source: "address_book" | "coordinates" | "photon" - - confidence: "exact" | "high" | "medium" | "low" - - type: "nickname" | "coordinates" | "street_address" | "poi" | "locality" - - labeled_as: present when result is within 75m of an address book entry - - Empty results array is valid (no match). No 404s. - """ - q = request.args.get('q', '').strip() - limit = request.args.get('limit', '10') - try: - limit = max(1, min(int(limit), 20)) - except (ValueError, TypeError): - limit = 10 - - # Viewport bias parameters (optional) - lat = _safe_float(request.args.get("lat"), -90, 90) - lon = _safe_float(request.args.get("lon"), -180, 180) - zoom = _safe_float(request.args.get("zoom"), 0, 22) - - result = nav_tools.geocode(q, limit=limit, lat=lat, lon=lon, zoom=zoom) - return jsonify(result) - - -@geocode_bp.route('/api/reverse') -def api_reverse(): - """ - Reverse geocode coordinates via Photon. - - GET /api/reverse?lat=X&lon=Y - - Returns same shape as /api/geocode: - {query: "lat,lon", results: [{name, lat, lon, source, type, raw, ...}], count} - - Returns 200 OK with empty results on no match. 400 on invalid coords. - """ - try: - lat = float(request.args.get('lat', '')) - lon = float(request.args.get('lon', '')) - except (ValueError, TypeError): - return jsonify({'error': 'Missing or invalid lat/lon parameters'}), 400 - - if not (-90 <= lat <= 90) or not (-180 <= lon <= 180): - return jsonify({'error': 'Coordinates out of range'}), 400 - - query_str = f"{lat},{lon}" - - try: - import requests as http_requests - resp = http_requests.get( - "http://localhost:2322/reverse", - params={"lat": lat, "lon": lon, "limit": 1}, - timeout=10, - ) - resp.raise_for_status() - data = resp.json() - features = data.get("features", []) - except Exception: - logger.warning("Photon reverse geocode failed for %s", query_str) - return jsonify({'query': query_str, 'results': [], 'count': 0}) - - if not features: - return jsonify({'query': query_str, 'results': [], 'count': 0}) - - from .geocode import _parse_photon_features - results = _parse_photon_features(features, source='photon_reverse') - - return jsonify({'query': query_str, 'results': results, 'count': len(results)}) diff --git a/lib/osm_categories.py b/lib/osm_categories.py deleted file mode 100644 index dd5217c..0000000 --- a/lib/osm_categories.py +++ /dev/null @@ -1,143 +0,0 @@ -""" -Human-readable category names for OSM class/type pairs. - -Used by the place detail proxy to turn ("amenity", "cafe") into "Coffee shop". -Covers ~50 common categories; unmapped pairs fall back to title-cased class:type. -""" - -# Exact (class, type) → label -CATEGORY_MAP = { - # Amenity - ("amenity", "cafe"): "Coffee shop", - ("amenity", "restaurant"): "Restaurant", - ("amenity", "fast_food"): "Fast food restaurant", - ("amenity", "bar"): "Bar", - ("amenity", "pub"): "Pub", - ("amenity", "biergarten"): "Beer garden", - ("amenity", "ice_cream"): "Ice cream shop", - ("amenity", "fuel"): "Gas station", - ("amenity", "charging_station"): "EV charging station", - ("amenity", "parking"): "Parking", - ("amenity", "bank"): "Bank", - ("amenity", "atm"): "ATM", - ("amenity", "pharmacy"): "Pharmacy", - ("amenity", "hospital"): "Hospital", - ("amenity", "clinic"): "Clinic", - ("amenity", "dentist"): "Dentist", - ("amenity", "doctors"): "Doctor's office", - ("amenity", "veterinary"): "Veterinarian", - ("amenity", "school"): "School", - ("amenity", "university"): "University", - ("amenity", "college"): "College", - ("amenity", "library"): "Library", - ("amenity", "post_office"): "Post office", - ("amenity", "fire_station"): "Fire station", - ("amenity", "police"): "Police station", - ("amenity", "townhall"): "Town hall", - ("amenity", "place_of_worship"): "Place of worship", - ("amenity", "theatre"): "Theatre", - ("amenity", "cinema"): "Cinema", - ("amenity", "community_centre"): "Community center", - ("amenity", "toilets"): "Restrooms", - ("amenity", "drinking_water"): "Drinking water", - ("amenity", "shelter"): "Shelter", - ("amenity", "camping"): "Campground", - # Shop - ("shop", "supermarket"): "Supermarket", - ("shop", "convenience"): "Convenience store", - ("shop", "hardware"): "Hardware store", - ("shop", "clothes"): "Clothing store", - ("shop", "car_repair"): "Auto repair", - ("shop", "car"): "Car dealership", - ("shop", "bakery"): "Bakery", - ("shop", "butcher"): "Butcher", - # Leisure - ("leisure", "park"): "Park", - ("leisure", "playground"): "Playground", - ("leisure", "sports_centre"): "Sports center", - ("leisure", "swimming_pool"): "Swimming pool", - ("leisure", "golf_course"): "Golf course", - ("leisure", "nature_reserve"): "Nature reserve", - ("leisure", "campsite"): "Campsite", - # Tourism - ("tourism", "hotel"): "Hotel", - ("tourism", "motel"): "Motel", - ("tourism", "guest_house"): "Guest house", - ("tourism", "hostel"): "Hostel", - ("tourism", "camp_site"): "Campsite", - ("tourism", "viewpoint"): "Viewpoint", - ("tourism", "museum"): "Museum", - ("tourism", "information"): "Information", - ("tourism", "attraction"): "Tourist attraction", - ("tourism", "picnic_site"): "Picnic site", - # Natural - ("natural", "peak"): "Peak", - ("natural", "spring"): "Spring", - ("natural", "hot_spring"): "Hot spring", - ("natural", "lake"): "Lake", - ("natural", "water"): "Water body", - ("natural", "cliff"): "Cliff", - ("natural", "cave_entrance"): "Cave", - # Highway - ("highway", "bus_stop"): "Bus stop", - ("highway", "rest_area"): "Rest area", - # Boundary - ("boundary", "administrative"): "Administrative boundary", - ("boundary", "protected_area"): "Protected area", - ("boundary", "national_park"): "National park", - # Place - ("place", "city"): "City", - ("place", "town"): "Town", - ("place", "village"): "Village", - ("place", "hamlet"): "Hamlet", - ("place", "suburb"): "Suburb", - ("place", "neighbourhood"): "Neighborhood", - # Building - ("building", "yes"): "Building", - # Waterway - ("waterway", "river"): "River", - ("waterway", "stream"): "Stream", - ("waterway", "waterfall"): "Waterfall", - # Landuse - ("landuse", "cemetery"): "Cemetery", - ("landuse", "forest"): "Forest", - # Historic - ("historic", "monument"): "Monument", - ("historic", "memorial"): "Memorial", - ("historic", "ruins"): "Ruins", -} - -# Class-level wildcard fallbacks (when exact type isn't mapped) -CLASS_FALLBACKS = { - "shop": "Shop", - "amenity": "Amenity", - "leisure": "Leisure", - "tourism": "Tourism", - "natural": "Natural feature", - "historic": "Historic site", -} - - -def humanize_category(osm_class, osm_type): - """Return a human-readable category string for an OSM class/type pair.""" - if not osm_class or not osm_type: - return "Place" - - osm_class = osm_class.lower() - osm_type = osm_type.lower() - - # Exact match - label = CATEGORY_MAP.get((osm_class, osm_type)) - if label: - return label - - # Class-level wildcard with formatted type - prefix = CLASS_FALLBACKS.get(osm_class) - if prefix: - nice_type = osm_type.replace("_", " ").title() - return f"{prefix}: {nice_type}" if prefix != nice_type else prefix - - # Generic fallback - nice_class = osm_class.replace("_", " ").title() - nice_type = osm_type.replace("_", " ").title() - return f"{nice_class}: {nice_type}" diff --git a/lib/overture.py b/lib/overture.py deleted file mode 100644 index fcbdd18..0000000 --- a/lib/overture.py +++ /dev/null @@ -1,170 +0,0 @@ -""" -Overture Maps enrichment layer. - -Provides lookup functions against the local PostgreSQL Overture Places database. -Two strategies: - 1. find_by_osm_id — exact match via OSM cross-reference index - 2. find_by_coords_and_name — spatial + fuzzy name fallback - -Connection pool is lazy-initialized on first call. If PostgreSQL is unreachable, -functions return None gracefully (feature degrades, doesn't crash). -""" -import json -import os - -import psycopg2 -import psycopg2.pool - -from .utils import setup_logging - -logger = setup_logging('recon.overture') - -_pool = None -_pool_failed = False - -# Map full OSM type names to single-letter codes used in Overture sources -OSM_TYPE_MAP = { - 'N': 'n', 'W': 'w', 'R': 'r', - 'node': 'n', 'way': 'w', 'relation': 'r', - 'n': 'n', 'w': 'w', 'r': 'r', -} - - -def _get_pool(): - """Lazy-init the connection pool. Returns None if Postgres is unreachable.""" - global _pool, _pool_failed - if _pool is not None: - return _pool - if _pool_failed: - return None - - try: - _pool = psycopg2.pool.SimpleConnectionPool( - minconn=1, - maxconn=3, - host=os.environ.get('OVERTURE_DB_HOST', 'localhost'), - port=int(os.environ.get('OVERTURE_DB_PORT', '5432')), - dbname=os.environ.get('OVERTURE_DB_NAME', 'overture'), - user=os.environ.get('OVERTURE_DB_USER', 'overture'), - password=os.environ.get('OVERTURE_DB_PASSWORD', ''), - connect_timeout=5, - ) - logger.info("Overture PostgreSQL connection pool initialized") - return _pool - except Exception as e: - _pool_failed = True - logger.warning(f"Overture PostgreSQL unavailable, enrichment disabled: {e}") - return None - - -def _query(sql, params): - """Execute a query and return the first row as a dict, or None.""" - pool = _get_pool() - if pool is None: - return None - - conn = None - try: - conn = pool.getconn() - with conn.cursor() as cur: - cur.execute(sql, params) - row = cur.fetchone() - if row is None: - return None - cols = [desc[0] for desc in cur.description] - return dict(zip(cols, row)) - except Exception as e: - logger.warning(f"Overture query error: {e}") - if conn: - try: - conn.rollback() - except Exception: - pass - return None - finally: - if conn: - try: - pool.putconn(conn) - except Exception: - pass - - -def _format_result(row, match_method): - """Convert a database row dict to the enrichment result shape.""" - if not row: - return None - - socials = row.get('socials') - if isinstance(socials, str): - try: - socials = json.loads(socials) - except (json.JSONDecodeError, TypeError): - socials = None - - return { - 'phone': row.get('phone'), - 'website': row.get('website'), - 'socials': socials, - 'brand_name': row.get('brand_name'), - 'brand_wikidata': row.get('brand_wikidata'), - 'basic_category': row.get('basic_category'), - 'confidence': row.get('confidence'), - 'gers_id': row.get('id'), - 'match_method': match_method, - } - - -def find_by_osm_id(osm_type, osm_id): - """ - Look up an Overture place by its OSM cross-reference. - - Args: - osm_type: OSM type — 'N', 'W', 'R', 'node', 'way', 'relation', or single letter - osm_id: OSM numeric ID - - Returns: - Enrichment dict or None - """ - type_letter = OSM_TYPE_MAP.get(osm_type) - if not type_letter: - return None - - row = _query( - """SELECT id, name, basic_category, confidence, - phone, website, socials, brand_name, brand_wikidata - FROM places - WHERE osm_type = %s AND osm_id = %s - LIMIT 1""", - (type_letter, int(osm_id)) - ) - return _format_result(row, 'osm_xref') - - -def find_by_coords_and_name(lat, lon, name, radius_m=100): - """ - Look up an Overture place by spatial proximity + fuzzy name match. - - Args: - lat: Latitude - lon: Longitude - name: Place name to fuzzy-match - radius_m: Search radius in meters (default 100) - - Returns: - Enrichment dict or None - """ - if not name or not lat or not lon: - return None - - row = _query( - """SELECT id, name, basic_category, confidence, - phone, website, socials, brand_name, brand_wikidata, - similarity(name, %s) AS sim - FROM places - WHERE ST_DWithin(geometry::geography, ST_MakePoint(%s, %s)::geography, %s) - AND similarity(name, %s) > 0.4 - ORDER BY sim DESC, ST_Distance(geometry::geography, ST_MakePoint(%s, %s)::geography) ASC - LIMIT 1""", - (name, lon, lat, radius_m, name, lon, lat) - ) - return _format_result(row, 'coord_name_fuzzy') diff --git a/lib/place_detail.py b/lib/place_detail.py deleted file mode 100644 index e85ee54..0000000 --- a/lib/place_detail.py +++ /dev/null @@ -1,817 +0,0 @@ -""" -Place detail proxy — local Nominatim first, Overpass API fallback, SQLite cache. -Overture Maps enrichment layer fills sparse extratags (phone, website, brand). - -Provides get_place_detail(osm_type, osm_id) which returns a cleaned dict -matching the response shape for /api/place//. -""" -import json -import os -import sqlite3 -import time - -import requests as http_requests - -from .osm_categories import humanize_category -from .utils import setup_logging - -logger = setup_logging('recon.place_detail') - -NOMINATIM_URL = "http://localhost:8010/details.php" -OVERPASS_URL = "https://overpass-api.de/api/interpreter" -OVERPASS_UA = "Navi/1.0 (forge.echo6.co/matt/recon)" -VALID_OSM_TYPES = {"N", "W", "R"} - -_db_conn = None - - -# ── SQLite cache ──────────────────────────────────────────────────────── - -def _get_db(): - """Return a module-level SQLite connection (lazy init).""" - global _db_conn - if _db_conn is not None: - return _db_conn - - db_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data') - os.makedirs(db_dir, exist_ok=True) - db_path = os.path.join(db_dir, 'place_cache.db') - - _db_conn = sqlite3.connect(db_path, check_same_thread=False) - _db_conn.execute("PRAGMA journal_mode=WAL") - _db_conn.execute("PRAGMA synchronous=NORMAL") - _db_conn.execute(""" - CREATE TABLE IF NOT EXISTS place_cache ( - osm_type TEXT NOT NULL, - osm_id INTEGER NOT NULL, - data TEXT NOT NULL, - source TEXT NOT NULL, - cached_at INTEGER NOT NULL, - PRIMARY KEY (osm_type, osm_id) - ) - """) - _db_conn.commit() - logger.info(f"Place cache DB ready at {db_path}") - return _db_conn - - -def cache_get(osm_type, osm_id): - """Return cached place dict or None.""" - db = _get_db() - row = db.execute( - "SELECT data FROM place_cache WHERE osm_type=? AND osm_id=?", - (osm_type, osm_id) - ).fetchone() - if row: - try: - result = json.loads(row[0]) - result['source'] = 'cache' - return result - except (json.JSONDecodeError, TypeError): - pass - return None - - -def cache_put(osm_type, osm_id, data, source): - """Store a place detail result in the cache (preserves google columns).""" - db = _get_db() - now = int(time.time()) - db.execute(""" - INSERT INTO place_cache (osm_type, osm_id, data, source, cached_at) - VALUES (?, ?, ?, ?, ?) - ON CONFLICT(osm_type, osm_id) DO UPDATE SET - data = excluded.data, - source = excluded.source, - cached_at = excluded.cached_at - """, (osm_type, osm_id, json.dumps(data), source, now)) - db.commit() - - -# ── Overture enrichment ───────────────────────────────────────────────── - -def _enrich_with_overture(result, osm_type, osm_id): - """ - Attempt to enrich a place result with Overture Maps data. - Fills sparse extratags (phone, website, brand) without overwriting existing values. - Returns the (possibly enriched) result dict. - """ - try: - from .deployment_config import get_deployment_config - deploy_config = get_deployment_config() - features = deploy_config.get('features', {}) - if not features.get('has_overture_enrichment', False): - return result - except Exception: - return result - - try: - from .overture import find_by_osm_id, find_by_coords_and_name - except ImportError: - logger.debug("Overture module not available") - return result - - enrichment = None - match_method = None - - # Strategy 1: OSM cross-reference (exact) - enrichment = find_by_osm_id(osm_type, osm_id) - if enrichment: - match_method = 'osm_xref' - - # Strategy 2: Coordinate + name fuzzy (fallback) - if not enrichment and result.get('centroid') and result.get('name'): - centroid = result['centroid'] - if centroid.get('lat') and centroid.get('lon'): - enrichment = find_by_coords_and_name( - centroid['lat'], centroid['lon'], result['name'] - ) - if enrichment: - match_method = 'coord_name_fuzzy' - - if not enrichment: - return result - - # Fill sparse extratags (never overwrite existing non-null values) - extratags = result.get('extratags', {}) - fill_map = [ - ('phone', 'phone'), - ('website', 'website'), - ('brand', 'brand_name'), - ('brand:wikidata', 'brand_wikidata'), - ] - for osm_key, overture_key in fill_map: - if not extratags.get(osm_key) and enrichment.get(overture_key): - extratags[osm_key] = enrichment[overture_key] - result['extratags'] = extratags - - # Add source metadata - result['sources'] = { - 'primary': result.get('source', 'unknown'), - 'enrichment': 'overture', - 'overture_match_method': match_method, - 'overture_gers_id': enrichment.get('gers_id'), - 'overture_confidence': enrichment.get('confidence'), - 'overture_basic_category': enrichment.get('basic_category'), - } - - logger.debug(f"Overture enrichment for {osm_type}/{osm_id}: {match_method}") - return result - - - -# ── Google Places enrichment (tertiary, gap-fill only) ────────────── - -# Business POI classes eligible for Google enrichment -_BUSINESS_CLASSES = {'amenity', 'shop', 'tourism', 'leisure', 'office', 'craft'} - -# Fields Google can fill -_GOOGLE_GAP_FIELDS = ('opening_hours', 'phone', 'website') - - -def _enrich_with_google(result, osm_type, osm_id): - """ - Tertiary enrichment via Google Places (New) API. - Only fires for business-type POIs when opening_hours, phone, or website - are still missing after OSM + Overture enrichment. - Fills only empty fields — never overwrites existing values. - """ - # Check feature flag - try: - from .deployment_config import get_deployment_config - deploy_config = get_deployment_config() - features = deploy_config.get('features', {}) - if not features.get('has_google_places_enrichment', False): - return result - except Exception: - return result - - # Only enrich business-type POIs - poi_class = result.get('class', '') - if poi_class not in _BUSINESS_CLASSES: - return result - - # Check if any gap fields are missing - extratags = result.get('extratags', {}) - gaps = [f for f in _GOOGLE_GAP_FIELDS if not extratags.get(f)] - if not gaps: - logger.debug(f"google_places: skip {osm_type}/{osm_id} — no gaps") - return result - - try: - from . import google_places - except ImportError: - logger.debug("google_places module not available") - return result - - # Check Google cache first - cached_pid, cached_data = google_places.cache_get_google(osm_type, osm_id) - if cached_pid and cached_data: - _apply_google_data(result, cached_data, gaps) - result.setdefault('sources', {})['google_places'] = { - 'place_id': cached_pid, - 'source': 'cache', - } - logger.debug(f"google_places: cache hit for {osm_type}/{osm_id}") - return result - - # Skip if already looked up and found nothing (cached_pid is None) - if cached_pid is not None: - return result - - # Skip new Google API calls for guest users (cached data already returned above) - from .auth import get_user_id - if not get_user_id(): - logger.debug(f"google_places: skip API call for {osm_type}/{osm_id} — guest user") - return result - - # Daily cap check - if not google_places.check_daily_cap(): - return result - - # Search for the place - name = result.get('name', '') - centroid = result.get('centroid', {}) - lat = centroid.get('lat') - lon = centroid.get('lon') - if not name or not lat or not lon: - return result - - place_id = google_places.search_place(name, lat, lon) - if not place_id: - # Cache the miss to avoid repeated lookups - google_places.cache_put_google(osm_type, osm_id, '__miss__', None) - return result - - # Get details - details = google_places.get_place_details(place_id) - if not details: - google_places.cache_put_google(osm_type, osm_id, place_id, None) - return result - - # Cache the result - google_places.cache_put_google(osm_type, osm_id, place_id, details) - - # Apply to result - _apply_google_data(result, details, gaps) - result.setdefault('sources', {})['google_places'] = { - 'place_id': place_id, - 'source': 'api', - 'daily_count': google_places.get_daily_count(), - } - - return result - - -def _apply_google_data(result, google_data, gaps): - """Apply Google Places data to fill gap fields only.""" - extratags = result.get('extratags', {}) - if 'opening_hours' in gaps: - osm_hours = google_data.get('opening_hours') - if osm_hours: - extratags['opening_hours'] = osm_hours - elif google_data.get('opening_hours_raw'): - extratags['opening_hours_raw'] = google_data['opening_hours_raw'] - if 'phone' in gaps and google_data.get('phone_number'): - extratags['phone'] = google_data['phone_number'] - if 'website' in gaps and google_data.get('website'): - extratags['website'] = google_data['website'] - result['extratags'] = extratags - - - - -# ── Wiki link rewriting ───────────────────────────────────────────────── - -# Extratag keys that may contain wiki references -_WIKI_TAGS = ('wikipedia', 'wikidata', 'wikivoyage', 'appropedia') - - -def _enrich_wiki_links(result): - """ - Rewrite wiki-related extratags to local Kiwix URLs where available. - Falls back to public URLs. Only runs when has_wiki_rewriting is enabled. - Returns the (possibly enriched) result dict. - """ - try: - from .deployment_config import get_deployment_config - deploy_config = get_deployment_config() - features = deploy_config.get('features', {}) - if not features.get('has_wiki_rewriting', False): - return result - except Exception: - return result - - try: - from .wiki_rewrite import rewrite_wiki_link - except ImportError: - logger.debug("wiki_rewrite module not available") - return result - - extratags = result.get('extratags', {}) - if not extratags: - return result - - rewrites = {} - for tag in _WIKI_TAGS: - value = extratags.get(tag) - if not value: - continue - url, status = rewrite_wiki_link(tag, value) - if status != 'original': - extratags[tag] = url - rewrites[tag] = status - - if rewrites: - result['extratags'] = extratags - result.setdefault('sources', {})['wiki_rewrites'] = rewrites - logger.debug(f"Wiki rewrites for {result.get('osm_type')}/{result.get('osm_id')}: {rewrites}") - - return result - -# ── Nominatim parsing ─────────────────────────────────────────────────── - -# Nominatim address array uses rank_address to indicate what each entry is. -# We map rank ranges to our flat address fields. -RANK_TO_FIELD = { - 4: 'country', - 5: 'postcode', - 6: 'state', # rank 6 = county in US, but we try name matching - 8: 'state', - 12: 'county', - 16: 'city', - 20: 'neighbourhood', - 22: 'neighbourhood', - 26: 'road', - 28: 'house_number', -} - - -def _parse_nominatim_address(address_array, country_code=None): - """Parse Nominatim's ranked address array into a flat address dict.""" - addr = { - 'house_number': None, - 'road': None, - 'neighbourhood': None, - 'city': None, - 'county': None, - 'state': None, - 'postcode': None, - 'country': None, - 'country_code': country_code, - } - - if not address_array: - return addr - - for entry in address_array: - if not entry.get('isaddress', False): - continue - - name = entry.get('localname', '') - rank = entry.get('rank_address', 0) - etype = entry.get('type', '') - eclass = entry.get('class', '') - - # Explicit type-based assignments (more reliable than rank alone) - if etype == 'country' and eclass == 'place': - addr['country'] = name - elif etype == 'state' or (eclass == 'boundary' and etype == 'administrative' and rank == 8): - if not addr['state']: - addr['state'] = name - elif etype == 'county' or (eclass == 'boundary' and etype == 'administrative' and rank in (10, 12)): - if not addr['county']: - addr['county'] = name - elif etype in ('city', 'town', 'village', 'hamlet') and eclass == 'place': - if not addr['city']: - addr['city'] = name - elif eclass == 'boundary' and etype == 'administrative' and rank == 16: - # City-level admin boundary (common in US) - if not addr['city']: - addr['city'] = name - elif etype == 'postcode': - addr['postcode'] = name - elif eclass == 'highway' or rank == 26: - if not addr['road']: - addr['road'] = name - elif etype == 'house_number' or rank == 28: - addr['house_number'] = name - elif rank in (20, 22) and not addr['neighbourhood']: - addr['neighbourhood'] = name - - # Remove county from output (not in spec) - addr.pop('county', None) - - return addr - - -def _parse_nominatim(data): - """Parse a Nominatim /details response into our canonical shape.""" - osm_type = data.get('osm_type', '') - osm_id = data.get('osm_id', 0) - osm_class = data.get('category', '') - osm_type_tag = data.get('type', '') - - # Centroid - centroid_geom = data.get('centroid', {}) - coords = centroid_geom.get('coordinates', [0, 0]) - centroid = {'lat': coords[1], 'lon': coords[0]} if len(coords) >= 2 else {'lat': 0, 'lon': 0} - - # Names - names = data.get('names', {}) - display_name = data.get('localname', '') or names.get('name', '') - - # Address - address = _parse_nominatim_address( - data.get('address', []), - country_code=data.get('country_code') - ) - - # Use calculated_postcode if address parse didn't find one - if not address.get('postcode') and data.get('calculated_postcode'): - address['postcode'] = data['calculated_postcode'] - - # Extratags - raw_extra = data.get('extratags', {}) - extratags = { - 'opening_hours': raw_extra.get('opening_hours'), - 'phone': raw_extra.get('phone') or raw_extra.get('contact:phone'), - 'website': raw_extra.get('website') or raw_extra.get('contact:website') or raw_extra.get('url'), - 'email': raw_extra.get('email') or raw_extra.get('contact:email'), - 'wikipedia': raw_extra.get('wikipedia'), - 'wikidata': raw_extra.get('wikidata'), - 'cuisine': raw_extra.get('cuisine'), - 'operator': raw_extra.get('operator'), - 'wheelchair': raw_extra.get('wheelchair'), - 'fee': raw_extra.get('fee'), - 'takeaway': raw_extra.get('takeaway'), - } - - # Category: use extratags.place for boundaries (e.g. "city"), else class/type - effective_class = osm_class - effective_type = osm_type_tag - if osm_class == 'boundary' and osm_type_tag == 'administrative': - place_tag = raw_extra.get('place') or raw_extra.get('linked_place') - if place_tag: - effective_class = 'place' - effective_type = place_tag - - category = humanize_category(effective_class, effective_type) - - # Filter names: only include extra name tags, not the bare "name" - extra_names = {k: v for k, v in names.items() if k != 'name'} if names else {} - - # Boundary geometry (polygon/multipolygon from Nominatim) - boundary = None - geom = data.get('geometry') - if geom and geom.get('type') in ('Polygon', 'MultiPolygon'): - boundary = geom - - return { - 'osm_type': osm_type, - 'osm_id': osm_id, - 'name': display_name, - 'category': category, - 'class': osm_class, - 'type': osm_type_tag, - 'address': address, - 'centroid': centroid, - 'extratags': extratags, - 'names': extra_names if extra_names else None, - 'source': 'nominatim_local', - 'boundary': boundary, - } - - -# ── Overpass parsing ──────────────────────────────────────────────────── - -OVERPASS_TYPE_MAP = {'N': 'node', 'W': 'way', 'R': 'relation'} - - -def _build_overpass_query(osm_type, osm_id): - """Build an Overpass QL query for a single element.""" - elem = OVERPASS_TYPE_MAP.get(osm_type) - if not elem: - return None - return f"[out:json][timeout:10];{elem}({osm_id});out tags center;" - - -def _parse_overpass(data, osm_type, osm_id): - """Parse an Overpass API response into our canonical shape.""" - elements = data.get('elements', []) - if not elements: - return None - - elem = elements[0] - tags = elem.get('tags', {}) - - # Centroid: Overpass returns lat/lon for nodes, center for ways/relations - lat = elem.get('lat') or (elem.get('center', {}).get('lat')) - lon = elem.get('lon') or (elem.get('center', {}).get('lon')) - centroid = {'lat': lat, 'lon': lon} if lat and lon else {'lat': 0, 'lon': 0} - - # Determine class/type from tags — Overpass doesn't have a canonical class field - # Use the first recognized class tag - osm_class = '' - osm_type_tag = '' - for cls in ('amenity', 'shop', 'leisure', 'tourism', 'natural', 'highway', - 'boundary', 'place', 'building', 'waterway', 'landuse', 'historic'): - if cls in tags: - osm_class = cls - osm_type_tag = tags[cls] - break - - category = humanize_category(osm_class, osm_type_tag) - - # Address from addr:* tags - address = { - 'house_number': tags.get('addr:housenumber'), - 'road': tags.get('addr:street'), - 'neighbourhood': tags.get('addr:suburb') or tags.get('addr:neighbourhood'), - 'city': tags.get('addr:city'), - 'state': tags.get('addr:state'), - 'postcode': tags.get('addr:postcode'), - 'country': tags.get('addr:country'), - 'country_code': tags.get('addr:country_code', - tags.get('addr:country', '')).lower()[:2] or None, - } - - # Extratags - extratags = { - 'opening_hours': tags.get('opening_hours'), - 'phone': tags.get('phone') or tags.get('contact:phone'), - 'website': tags.get('website') or tags.get('contact:website') or tags.get('url'), - 'email': tags.get('email') or tags.get('contact:email'), - 'wikipedia': tags.get('wikipedia'), - 'wikidata': tags.get('wikidata'), - 'cuisine': tags.get('cuisine'), - 'operator': tags.get('operator'), - 'wheelchair': tags.get('wheelchair'), - 'fee': tags.get('fee'), - 'takeaway': tags.get('takeaway'), - } - - # Names - name = tags.get('name', '') - extra_names = {} - for k, v in tags.items(): - if k.startswith('name:') or k in ('alt_name', 'old_name', 'short_name', 'official_name'): - extra_names[k] = v - - return { - 'osm_type': osm_type, - 'osm_id': osm_id, - 'name': name, - 'category': category, - 'class': osm_class, - 'type': osm_type_tag, - 'address': address, - 'centroid': centroid, - 'extratags': extratags, - 'names': extra_names if extra_names else None, - 'source': 'overpass', - } - - -# ── Public API ────────────────────────────────────────────────────────── - -def get_place_detail(osm_type, osm_id): - """ - Fetch place details for an OSM element. - - Returns (dict, status_code): - - (data, 200) on success - - (error_dict, 404) if not found in any source - - (error_dict, 502) if both sources error - """ - osm_type = osm_type.upper() - if osm_type not in VALID_OSM_TYPES: - return {'error': f'Invalid osm_type: {osm_type}. Must be N, W, or R.'}, 400 - - if osm_id <= 0: - return {'error': 'osm_id must be a positive integer'}, 400 - - # 1. Check cache - cached = cache_get(osm_type, osm_id) - if cached: - logger.debug(f"Cache hit: {osm_type}/{osm_id}") - return cached, 200 - - # 2. Try local Nominatim first - nominatim_result = None - nominatim_error = None - try: - resp = http_requests.get(NOMINATIM_URL, params={ - 'osmtype': osm_type, - 'osmid': osm_id, - 'format': 'json', - 'addressdetails': 1, - 'hierarchy': 0, - 'keywords': 0, - 'polygon_geojson': 1, - }, timeout=5) - - if resp.status_code == 200: - data = resp.json() - # Nominatim returns a result even for IDs not in its DB, - # but they'll have empty/minimal data. Check for osm_id match. - if data.get('osm_id') == osm_id: - nominatim_result = _parse_nominatim(data) - logger.debug(f"Nominatim hit: {osm_type}/{osm_id}") - except Exception as e: - nominatim_error = str(e) - logger.warning(f"Nominatim error for {osm_type}/{osm_id}: {e}") - - if nominatim_result: - nominatim_result = _enrich_with_overture(nominatim_result, osm_type, osm_id) - nominatim_result = _enrich_with_google(nominatim_result, osm_type, osm_id) - nominatim_result = _enrich_wiki_links(nominatim_result) - cache_put(osm_type, osm_id, nominatim_result, 'nominatim_local') - return nominatim_result, 200 - - # 3. Fallback to Overpass - overpass_result = None - overpass_error = None - try: - query = _build_overpass_query(osm_type, osm_id) - if query: - resp = http_requests.post( - OVERPASS_URL, - data={'data': query}, - headers={'User-Agent': OVERPASS_UA}, - timeout=10, - ) - if resp.status_code == 200: - data = resp.json() - overpass_result = _parse_overpass(data, osm_type, osm_id) - if overpass_result: - logger.debug(f"Overpass hit: {osm_type}/{osm_id}") - elif resp.status_code == 429: - overpass_error = "Overpass rate limited" - logger.warning(f"Overpass 429 for {osm_type}/{osm_id}") - else: - overpass_error = f"Overpass HTTP {resp.status_code}" - except Exception as e: - overpass_error = str(e) - logger.warning(f"Overpass error for {osm_type}/{osm_id}: {e}") - - if overpass_result: - overpass_result = _enrich_with_overture(overpass_result, osm_type, osm_id) - overpass_result = _enrich_with_google(overpass_result, osm_type, osm_id) - overpass_result = _enrich_wiki_links(overpass_result) - cache_put(osm_type, osm_id, overpass_result, 'overpass') - return overpass_result, 200 - - # 4. Both failed - if nominatim_error and overpass_error: - logger.error(f"Both sources failed for {osm_type}/{osm_id}: " - f"Nominatim={nominatim_error}, Overpass={overpass_error}") - return {'error': 'Both data sources unavailable'}, 502 - - # Not found in either source (no errors, just empty results) - return {'error': f'{osm_type}/{osm_id} not found'}, 404 - - -# ── Wikidata lookup ───────────────────────────────────────────────────── - -WIKIDATA_API_URL = "https://www.wikidata.org/w/api.php" - -def get_place_by_wikidata(wikidata_id): - """ - Fetch place details from Wikidata entity. - - Returns (dict, status_code): - - (data, 200) on success - - (error_dict, 404) if entity not found - - (error_dict, 400) if invalid ID format - - (error_dict, 502) on API error - """ - # Validate wikidata ID format (Q followed by digits) - wikidata_id = wikidata_id.upper().strip() - if not wikidata_id.startswith("Q") or not wikidata_id[1:].isdigit(): - return {"error": f"Invalid wikidata ID: {wikidata_id}. Must be Q followed by digits."}, 400 - - try: - resp = http_requests.get(WIKIDATA_API_URL, params={ - "action": "wbgetentities", - "ids": wikidata_id, - "format": "json", - "languages": "en", - "props": "labels|descriptions|claims|sitelinks", - }, timeout=10, headers={"User-Agent": "Navi/1.0 (forge.echo6.co/matt/recon)"}) - - if resp.status_code != 200: - logger.warning(f"Wikidata API error for {wikidata_id}: HTTP {resp.status_code}") - return {"error": "Wikidata API error"}, 502 - - data = resp.json() - entities = data.get("entities", {}) - entity = entities.get(wikidata_id) - - if not entity or entity.get("missing"): - return {"error": f"Wikidata entity {wikidata_id} not found"}, 404 - - # Extract basic info - labels = entity.get("labels", {}) - descriptions = entity.get("descriptions", {}) - claims = entity.get("claims", {}) - - name = labels.get("en", {}).get("value", wikidata_id) - description = descriptions.get("en", {}).get("value", "") - - # Extract coordinates from P625 (coordinate location) - lat, lon = None, None - if "P625" in claims: - coord_claim = claims["P625"] - if coord_claim and coord_claim[0].get("mainsnak", {}).get("datavalue"): - coord_val = coord_claim[0]["mainsnak"]["datavalue"]["value"] - lat = coord_val.get("latitude") - lon = coord_val.get("longitude") - - # Extract population from P1082 - population = None - if "P1082" in claims: - pop_claims = claims["P1082"] - if pop_claims: - # Get the most recent population value - for claim in pop_claims: - if claim.get("mainsnak", {}).get("datavalue"): - try: - population = int(claim["mainsnak"]["datavalue"]["value"]["amount"].lstrip("+")) - break - except (KeyError, ValueError): - pass - - # Extract country from P17 - country = None - if "P17" in claims: - country_claims = claims["P17"] - if country_claims and country_claims[0].get("mainsnak", {}).get("datavalue"): - country_id = country_claims[0]["mainsnak"]["datavalue"]["value"]["id"] - # Could resolve this to a name, but for now just store the ID - - # Extract instance of (P31) for type classification - instance_of = [] - if "P31" in claims: - for claim in claims["P31"]: - if claim.get("mainsnak", {}).get("datavalue"): - instance_of.append(claim["mainsnak"]["datavalue"]["value"]["id"]) - - # Extract OSM relation ID if available (P402) - osm_relation_id = None - if "P402" in claims: - osm_claims = claims["P402"] - if osm_claims and osm_claims[0].get("mainsnak", {}).get("datavalue"): - osm_relation_id = osm_claims[0]["mainsnak"]["datavalue"]["value"] - - # Extract Wikipedia sitelink - sitelinks = entity.get("sitelinks", {}) - wikipedia = None - if "enwiki" in sitelinks: - wiki_title = sitelinks["enwiki"].get("title", "") - if wiki_title: - wikipedia = f"en:{wiki_title}" - - result = { - "wikidata_id": wikidata_id, - "name": name, - "description": description, - "centroid": {"lat": lat, "lon": lon} if lat and lon else None, - "population": population, - "instance_of": instance_of, - "osm_relation_id": osm_relation_id, - "source": "wikidata", - "extratags": { - "wikidata": wikidata_id, - }, - } - - if wikipedia: - result["extratags"]["wikipedia"] = wikipedia - - # Fetch boundary polygon from Nominatim if we have an OSM relation ID - boundary = None - if osm_relation_id: - try: - nom_resp = http_requests.get(NOMINATIM_URL, params={ - 'osmtype': 'R', - 'osmid': osm_relation_id, - 'format': 'json', - 'polygon_geojson': 1, - }, timeout=5) - if nom_resp.status_code == 200: - nom_data = nom_resp.json() - geom = nom_data.get('geometry') - if geom and geom.get('type') in ('Polygon', 'MultiPolygon'): - boundary = geom - logger.debug(f"Wikidata boundary hit for {wikidata_id}") - except Exception as e: - logger.debug(f"Wikidata boundary fetch failed: {e}") - - result["boundary"] = boundary - - logger.debug(f"Wikidata hit: {wikidata_id} -> {name}") - return result, 200 - - except Exception as e: - logger.warning(f"Wikidata error for {wikidata_id}: {e}") - return {"error": "Wikidata lookup failed"}, 502 diff --git a/lib/processors/zim_processor.py b/lib/processors/zim_processor.py index b258408..6f5c887 100644 --- a/lib/processors/zim_processor.py +++ b/lib/processors/zim_processor.py @@ -77,10 +77,73 @@ def _text_hash(text): return hashlib.md5(text.encode('utf-8')).hexdigest() +def _flatten_table(table_el): + """Convert a element to pipe-delimited text. + + Each becomes a row with cells joined by ' | '. + Returns the formatted table as a string with blank lines around it. + """ + rows = [] + for tr in table_el.iter('tr'): + cells = [] + for cell in tr: + if cell.tag in ('td', 'th'): + cell_text = (cell.text_content() or '').strip() + # Collapse internal whitespace in each cell + cell_text = re.sub(r'\s+', ' ', cell_text) + if cell_text: + cells.append(cell_text) + if cells: + rows.append(' | '.join(cells)) + if not rows: + return '' + return '\n'.join(rows) + + +def _preprocess_tree(doc): + """Pre-process HTML tree to add delimiters before text_content() flattens it. + + Handles:
,
,
  • ,
    ,
    -- elements that lxml's + text_content() would concatenate without any separators. + """ + from lxml import etree + + # 1. Replace
  • elements with their pipe-delimited text + for table in list(doc.iter('table')): + formatted = _flatten_table(table) + if formatted: + replacement = etree.Element('div') + replacement.text = '\n\n' + formatted + '\n\n' + parent = table.getparent() + if parent is not None: + parent.replace(table, replacement) + else: + table.drop_tree() + + # 2.
    -> inject newline + for br in list(doc.iter('br')): + br.tail = '\n' + (br.tail or '') + + # 3.
  • -> inject newline + "- " prefix + for li in list(doc.iter('li')): + li.text = '- ' + (li.text or '') + li.tail = '\n' + (li.tail or '') + + # 4.
    -> inject newline before + for dt in list(doc.iter('dt')): + dt.tail = '\n' + (dt.tail or '') + + # 5.
    -> inject newline + indent + for dd in list(doc.iter('dd')): + dd.text = ' ' + (dd.text or '') + dd.tail = '\n' + (dd.tail or '') + + def _html_to_text(html_bytes): """Convert HTML bytes to clean text via lxml. Strips nav, footer, script, style elements. Decodes entities. + Pre-processes tables, lists, and line breaks for proper delimiters. Normalizes whitespace. """ try: @@ -93,6 +156,9 @@ def _html_to_text(html_bytes): for el in doc.iter(tag): el.drop_tree() + # Pre-process tree: tables -> pipe-delimited, br -> newlines, li -> dashes + _preprocess_tree(doc) + # Extract text text = doc.text_content() diff --git a/lib/wiki_rewrite.py b/lib/wiki_rewrite.py deleted file mode 100644 index d884635..0000000 --- a/lib/wiki_rewrite.py +++ /dev/null @@ -1,324 +0,0 @@ -""" -Wiki link rewriter — rewrites OSM wikipedia/wikidata/wikivoyage/appropedia -links to local Kiwix URLs where the article exists in a loaded ZIM. - -Falls back silently to public URLs when article is unavailable locally. -Caches positive results only in place_cache.db. - -Kiwix catalog is parsed from the OPDS Atom feed at startup and refreshed -hourly to pick up newly loaded ZIMs without a restart. - -Operations note: - - After loading a new ZIM, either restart RECON (forces fresh catalog - fetch) or wait up to 1 hour for automatic refresh. - - To invalidate the wiki cache (e.g. after ZIM update): - sqlite3 /opt/recon/data/place_cache.db "DELETE FROM wiki_cache;" -""" -import os -import re -import sqlite3 -import time -import xml.etree.ElementTree as ET -from urllib.parse import unquote, quote - -import requests as http_requests - -from .utils import setup_logging - -logger = setup_logging('recon.wiki_rewrite') - -# ── Configuration ─────────────────────────────────────────────────────── - -KIWIX_BASE = "http://localhost:8430" -KIWIX_PUBLIC_BASE = "https://wiki.echo6.co" -KIWIX_CATALOG_URL = f"{KIWIX_BASE}/catalog/v2/entries" -HEAD_TIMEOUT = 1.5 # seconds -CATALOG_REFRESH_INTERVAL = 3600 # 1 hour - -# OPDS Atom namespace -_ATOM_NS = "http://www.w3.org/2005/Atom" - -# ── ZIM catalog map ───────────────────────────────────────────────────── - -_zim_map = {} # source_type → content_path e.g. 'wikipedia' → 'wikipedia_en_all_maxi_2026-02' -_zim_map_ts = 0.0 # last refresh timestamp - -# Prefix-to-source-type mapping (order matters: longest prefix first) -_ZIM_PREFIX_MAP = [ - ('wikipedia_en_all', 'wikipedia'), - ('appropedia_en_all', 'appropedia'), - ('wikivoyage_en', 'wikivoyage'), - ('wikidata_en', 'wikidata'), -] - - -def _discover_zims(): - """Parse Kiwix OPDS Atom catalog to map source types to content paths.""" - global _zim_map, _zim_map_ts - - try: - resp = http_requests.get(KIWIX_CATALOG_URL, timeout=5) - if resp.status_code != 200: - logger.warning(f"Kiwix catalog returned HTTP {resp.status_code}") - return - - root = ET.fromstring(resp.content) - new_map = {} - - for entry in root.findall(f"{{{_ATOM_NS}}}entry"): - name_el = entry.find(f"{{{_ATOM_NS}}}name") - if name_el is None: - continue - book_name = name_el.text or "" - - # - content_path = None - for link in entry.findall(f"{{{_ATOM_NS}}}link"): - if link.get("type") == "text/html": - href = link.get("href", "") - if href.startswith("/content/"): - content_path = href[len("/content/"):] - break - - if not content_path: - continue - - # Match book name against known prefixes - for prefix, source_type in _ZIM_PREFIX_MAP: - if book_name.startswith(prefix): - new_map[source_type] = content_path - break - - _zim_map = new_map - _zim_map_ts = time.time() - logger.info(f"ZIM catalog refreshed: {new_map}") - - except Exception as e: - logger.warning(f"Failed to discover ZIMs from Kiwix catalog: {e}") - - -def _ensure_zim_map(): - """Lazy-load and refresh ZIM map if stale.""" - if not _zim_map or (time.time() - _zim_map_ts) > CATALOG_REFRESH_INTERVAL: - _discover_zims() - - -# ── Database (wiki_cache in place_cache.db) ───────────────────────────── - -_db_conn = None - - -def _get_db(): - """Return a module-level SQLite connection to place_cache.db (lazy init).""" - global _db_conn - if _db_conn is not None: - return _db_conn - - db_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data') - os.makedirs(db_dir, exist_ok=True) - db_path = os.path.join(db_dir, 'place_cache.db') - - _db_conn = sqlite3.connect(db_path, check_same_thread=False) - _db_conn.execute("PRAGMA journal_mode=WAL") - _db_conn.execute("PRAGMA synchronous=NORMAL") - _db_conn.execute(""" - CREATE TABLE IF NOT EXISTS wiki_cache ( - source_type TEXT NOT NULL, - article_id TEXT NOT NULL, - kiwix_url TEXT NOT NULL, - cached_at INTEGER NOT NULL, - PRIMARY KEY (source_type, article_id) - ) - """) - _db_conn.commit() - logger.info(f"Wiki cache table ready in {db_path}") - return _db_conn - - -# ── URL classification ────────────────────────────────────────────────── - -# Patterns for OSM wikipedia/wikidata tag values -_WIKI_TAG_RE = re.compile(r'^(?:en:)?(.+)$') # "en:Title" or just "Title" -_WIKI_URL_RE = re.compile(r'https?://en\.wikipedia\.org/wiki/(.+)') -_WIKIDATA_TAG_RE = re.compile(r'^(Q\d+)$') -_WIKIDATA_URL_RE = re.compile(r'https?://(?:www\.)?wikidata\.org/wiki/(Q\d+)') -_WIKIVOYAGE_URL_RE = re.compile(r'https?://en\.wikivoyage\.org/wiki/(.+)') -_APPROPEDIA_URL_RE = re.compile(r'https?://(?:www\.)?appropedia\.org/(?:wiki/)?(.+)') - - -def _normalize_article_id(article_id): - """Normalize article ID to MediaWiki/Kiwix convention: spaces → underscores.""" - return article_id.replace(' ', '_') - - -def classify_wiki_link(tag_name, value): - """ - Classify an OSM extratag value into (source_type, article_id) or None. - - tag_name: the extratags key ('wikipedia', 'wikidata', etc.) - value: the raw tag value from OSM - - Article IDs are normalized to MediaWiki convention (spaces → underscores). - """ - if not value or not isinstance(value, str): - return None - - value = value.strip() - - if tag_name == 'wikidata': - m = _WIKIDATA_TAG_RE.match(value) - if m: - return ('wikidata', m.group(1)) - m = _WIKIDATA_URL_RE.match(value) - if m: - return ('wikidata', m.group(1)) - return None - - if tag_name == 'wikipedia': - # URL form: https://en.wikipedia.org/wiki/Title - m = _WIKI_URL_RE.match(value) - if m: - return ('wikipedia', _normalize_article_id(unquote(m.group(1)))) - # Tag form: "en:Title" or "Title" - m = _WIKI_TAG_RE.match(value) - if m: - return ('wikipedia', _normalize_article_id(m.group(1))) - return None - - if tag_name == 'wikivoyage': - m = _WIKIVOYAGE_URL_RE.match(value) - if m: - return ('wikivoyage', _normalize_article_id(unquote(m.group(1)))) - # Plain tag: "en:Title" or "Title" - m = _WIKI_TAG_RE.match(value) - if m: - return ('wikivoyage', _normalize_article_id(m.group(1))) - return None - - if tag_name == 'appropedia': - m = _APPROPEDIA_URL_RE.match(value) - if m: - return ('appropedia', _normalize_article_id(unquote(m.group(1)))) - return ('appropedia', _normalize_article_id(value)) - - return None - - -# ── URL builders ──────────────────────────────────────────────────────── - -def build_kiwix_url(source_type, article_id): - """Build a public Kiwix URL. Returns None if source_type not in ZIM map.""" - _ensure_zim_map() - content_path = _zim_map.get(source_type) - if not content_path: - return None - return f"{KIWIX_PUBLIC_BASE}/content/{content_path}/{quote(article_id, safe='/:@!$&\'()*+,;=')}" - - -_PUBLIC_URL_TEMPLATES = { - 'wikipedia': "https://en.wikipedia.org/wiki/{id}", - 'wikidata': "https://www.wikidata.org/wiki/{id}", - 'wikivoyage': "https://en.wikivoyage.org/wiki/{id}", - 'appropedia': "https://www.appropedia.org/wiki/{id}", -} - - -def build_public_url(source_type, article_id): - """Build the canonical public URL for a wiki article.""" - tmpl = _PUBLIC_URL_TEMPLATES.get(source_type) - if not tmpl: - return None - return tmpl.format(id=quote(article_id, safe='/:@!$&\'()*+,;=')) - - -# ── Kiwix availability check ─────────────────────────────────────────── - -def check_kiwix_has_article(source_type, article_id): - """ - Check if an article exists in local Kiwix. - - Returns (bool, url): - - (True, kiwix_public_url) if article exists locally - - (False, None) if not found or Kiwix unavailable - - Only positive results are cached. - """ - # Check cache first - db = _get_db() - row = db.execute( - "SELECT kiwix_url FROM wiki_cache WHERE source_type=? AND article_id=?", - (source_type, article_id) - ).fetchone() - if row: - return (True, row[0]) - - # Build local HEAD URL - _ensure_zim_map() - content_path = _zim_map.get(source_type) - if not content_path: - return (False, None) - - head_url = f"{KIWIX_BASE}/content/{content_path}/{quote(article_id, safe='/:@!$&\'()*+,;=')}" - - try: - resp = http_requests.head(head_url, timeout=HEAD_TIMEOUT, allow_redirects=True) - if resp.status_code == 200: - kiwix_url = build_kiwix_url(source_type, article_id) - # Cache positive result - now = int(time.time()) - db.execute(""" - INSERT OR REPLACE INTO wiki_cache (source_type, article_id, kiwix_url, cached_at) - VALUES (?, ?, ?, ?) - """, (source_type, article_id, kiwix_url, now)) - db.commit() - return (True, kiwix_url) - else: - return (False, None) - except Exception as e: - logger.debug(f"Kiwix HEAD failed for {source_type}/{article_id}: {e}") - return (False, None) - - -# ── Primary entry point ──────────────────────────────────────────────── - -def rewrite_wiki_link(tag_name, value): - """ - Rewrite an OSM wiki tag value to a local Kiwix URL if available. - - Returns (url, 'local'|'public') or (None, None) if unrecognized. - """ - classified = classify_wiki_link(tag_name, value) - if not classified: - return (value, 'original') - - source_type, article_id = classified - - # Try local Kiwix - found, kiwix_url = check_kiwix_has_article(source_type, article_id) - if found and kiwix_url: - return (kiwix_url, 'local') - - # Fall back to public URL - public_url = build_public_url(source_type, article_id) - if public_url: - return (public_url, 'public') - - return (value, 'original') - - -# ── Discovery stubs (disabled, for future activation) ─────────────────── - -def discover_wikivoyage_article(name, category, lat, lon): - """ - Discover a related Wikivoyage article for a place. - Enabled by has_wiki_discovery. Currently returns None. - """ - return None - - -def discover_appropedia_article(name, category): - """ - Discover a related Appropedia article for a place. - Enabled by has_wiki_discovery. Currently returns None. - """ - return None diff --git a/requirements.txt b/requirements.txt index f643cd8..1da21bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ anyio==4.12.1 babel==2.18.0 beautifulsoup4==4.14.3 blinker==1.9.0 +cachetools==7.1.3 certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4 diff --git a/scripts/overture_import.py b/scripts/overture_import.py deleted file mode 100644 index 0b6ba67..0000000 --- a/scripts/overture_import.py +++ /dev/null @@ -1,350 +0,0 @@ -#!/usr/bin/env python3 -"""Overture Maps Places → PostgreSQL import script (v2). - -Downloads Overture Places Parquet from S3 via DuckDB (public bucket, no credentials), -filters to North America bounding box, and inserts into local PostgreSQL with PostGIS. - -Usage: - cd /opt/recon && venv/bin/python scripts/overture_import.py - -Re-runnable (idempotent via UPSERT). -""" - -import json -import logging -import os -import re -import sys -import time - -import duckdb -import psycopg2 -import psycopg2.extras - -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s %(levelname)s %(message)s', - datefmt='%H:%M:%S' -) -log = logging.getLogger('overture_import') - -# --- Config --- -OVERTURE_RELEASE = '2026-04-15.0' -S3_PATH = f's3://overturemaps-us-west-2/release/{OVERTURE_RELEASE}/theme=places/type=place/*' - -# North America bounding box (generous — includes Hawaii, Puerto Rico, Canada) -BBOX = { - 'xmin': -170.0, - 'xmax': -50.0, - 'ymin': 15.0, - 'ymax': 85.0, -} - -BATCH_SIZE = 50_000 -OSM_RECORD_RE = re.compile(r'^([nwr])(\d+)@\d+$') - -DB_CONFIG = { - 'host': os.environ.get('OVERTURE_DB_HOST', 'localhost'), - 'port': int(os.environ.get('OVERTURE_DB_PORT', '5432')), - 'dbname': os.environ.get('OVERTURE_DB_NAME', 'overture'), - 'user': os.environ.get('OVERTURE_DB_USER', 'overture'), - 'password': os.environ.get('OVERTURE_DB_PASSWORD', ''), -} - - -def create_table(conn): - """Create places table and indexes if they don't exist.""" - with conn.cursor() as cur: - cur.execute(""" - CREATE TABLE IF NOT EXISTS places ( - id TEXT PRIMARY KEY, - geometry GEOMETRY(Point, 4326), - name TEXT, - basic_category TEXT, - confidence REAL, - phone TEXT, - website TEXT, - socials JSONB, - brand_name TEXT, - brand_wikidata TEXT, - osm_type CHAR(1), - osm_id BIGINT, - source_record_id TEXT, - raw_sources JSONB - ); - """) - cur.execute(""" - CREATE INDEX IF NOT EXISTS idx_places_osm - ON places(osm_type, osm_id) WHERE osm_type IS NOT NULL; - """) - cur.execute(""" - CREATE INDEX IF NOT EXISTS idx_places_geom - ON places USING GIST(geometry); - """) - cur.execute(""" - CREATE INDEX IF NOT EXISTS idx_places_name_trgm - ON places USING GIN(name gin_trgm_ops); - """) - conn.commit() - log.info('Table and indexes ready') - - -def parse_osm_ref(sources): - """Extract OSM type letter and ID from Overture sources array.""" - if not sources: - return None, None, None - for src in sources: - record_id = None - if isinstance(src, dict): - record_id = src.get('record_id', '') - elif hasattr(src, '__getitem__'): - # DuckDB struct — try attribute access - try: - record_id = src['record_id'] - except (KeyError, TypeError, IndexError): - pass - if not record_id: - continue - m = OSM_RECORD_RE.match(str(record_id)) - if m: - return m.group(1), int(m.group(2)), str(record_id) - return None, None, None - - -def run_import(): - """Main import: DuckDB reads S3 Parquet → PostgreSQL via chunked OFFSET/LIMIT.""" - log.info(f'Overture release: {OVERTURE_RELEASE}') - log.info(f'S3 path: {S3_PATH}') - log.info(f'Bounding box: {BBOX}') - - # Connect to PostgreSQL - conn = psycopg2.connect(**DB_CONFIG) - conn.autocommit = False - create_table(conn) - - # Set up DuckDB with httpfs and spatial for S3 access - duck = duckdb.connect() - duck.execute("INSTALL httpfs; LOAD httpfs;") - duck.execute("INSTALL spatial; LOAD spatial;") - duck.execute("SET s3_region='us-west-2';") - - # Use a materialized approach: DuckDB query → Arrow → iterate in Python - query = f""" - SELECT - id, - ST_X(geometry) AS lon, - ST_Y(geometry) AS lat, - names.primary AS name, - basic_category, - confidence, - phones, - websites, - socials, - brand, - sources - FROM read_parquet('{S3_PATH}', hive_partitioning=true) - WHERE bbox.xmin >= {BBOX['xmin']} - AND bbox.xmax <= {BBOX['xmax']} - AND bbox.ymin >= {BBOX['ymin']} - AND bbox.ymax <= {BBOX['ymax']} - """ - - log.info('Starting DuckDB query against S3 (this will take several minutes)...') - t_start = time.time() - - # Execute and fetch all as Arrow for efficient iteration - result_rel = duck.sql(query) - - upsert_sql = """ - INSERT INTO places (id, geometry, name, basic_category, confidence, - phone, website, socials, brand_name, brand_wikidata, - osm_type, osm_id, source_record_id, raw_sources) - VALUES %s - ON CONFLICT (id) DO UPDATE SET - geometry = EXCLUDED.geometry, - name = EXCLUDED.name, - basic_category = EXCLUDED.basic_category, - confidence = EXCLUDED.confidence, - phone = EXCLUDED.phone, - website = EXCLUDED.website, - socials = EXCLUDED.socials, - brand_name = EXCLUDED.brand_name, - brand_wikidata = EXCLUDED.brand_wikidata, - osm_type = EXCLUDED.osm_type, - osm_id = EXCLUDED.osm_id, - source_record_id = EXCLUDED.source_record_id, - raw_sources = EXCLUDED.raw_sources - """ - - template = """( - %(id)s, - ST_SetSRID(ST_MakePoint(%(lon)s, %(lat)s), 4326), - %(name)s, - %(basic_category)s, - %(confidence)s, - %(phone)s, - %(website)s, - %(socials)s::jsonb, - %(brand_name)s, - %(brand_wikidata)s, - %(osm_type)s, - %(osm_id)s, - %(source_record_id)s, - %(raw_sources)s::jsonb - )""" - - total = 0 - osm_refs = 0 - batch = [] - - log.info('DuckDB query executing, fetching results in chunks...') - - # Fetch in chunks using fetchmany on the relation - chunk_size = BATCH_SIZE - while True: - chunk = result_rel.fetchmany(chunk_size) - if not chunk: - break - - for row in chunk: - row_id = row[0] - lon = row[1] - lat = row[2] - name = row[3] - basic_cat = row[4] - conf = row[5] - phones = row[6] - websites = row[7] - socials_raw = row[8] - brand_raw = row[9] - sources_raw = row[10] - - if lon is None or lat is None: - continue - - # Phone: first element of VARCHAR[] - phone = None - if phones and len(phones) > 0: - phone = str(phones[0]) if phones[0] else None - - # Website: first element of VARCHAR[] - website = None - if websites and len(websites) > 0: - website = str(websites[0]) if websites[0] else None - - # Socials: VARCHAR[] → JSON array of strings - socials_json = None - if socials_raw and len(socials_raw) > 0: - socials_json = json.dumps([str(s) for s in socials_raw if s]) - - # Brand: struct with wikidata and names.primary - brand_name = None - brand_wikidata = None - if brand_raw: - try: - if isinstance(brand_raw, dict): - brand_wikidata = brand_raw.get('wikidata') - names_struct = brand_raw.get('names') - if names_struct and isinstance(names_struct, dict): - brand_name = names_struct.get('primary') - else: - # DuckDB struct — access by key - brand_wikidata = brand_raw['wikidata'] if 'wikidata' in dir(brand_raw) else None - try: - brand_wikidata = brand_raw[0] # wikidata is first field - names_struct = brand_raw[1] # names is second field - if names_struct: - brand_name = names_struct[0] # primary is first field - except (IndexError, TypeError): - pass - except Exception: - pass - - # Sources: parse OSM cross-reference - sources_list = None - if sources_raw: - if isinstance(sources_raw, (list, tuple)): - sources_list = [] - for s in sources_raw: - if isinstance(s, dict): - sources_list.append(s) - else: - # DuckDB struct tuple — convert - try: - sources_list.append({ - 'dataset': s[1] if len(s) > 1 else None, - 'record_id': s[3] if len(s) > 3 else None, - }) - except (TypeError, IndexError): - pass - - osm_type_letter, osm_id_val, source_record_id = parse_osm_ref(sources_list) - if osm_type_letter: - osm_refs += 1 - - raw_sources_json = json.dumps(sources_list) if sources_list else None - - batch.append({ - 'id': row_id, - 'lon': float(lon), - 'lat': float(lat), - 'name': name, - 'basic_category': basic_cat, - 'confidence': float(conf) if conf is not None else None, - 'phone': phone, - 'website': website, - 'socials': socials_json, - 'brand_name': brand_name, - 'brand_wikidata': brand_wikidata, - 'osm_type': osm_type_letter, - 'osm_id': osm_id_val, - 'source_record_id': source_record_id, - 'raw_sources': raw_sources_json, - }) - - if len(batch) >= BATCH_SIZE: - with conn.cursor() as cur: - psycopg2.extras.execute_values( - cur, upsert_sql, batch, - template=template, - page_size=BATCH_SIZE - ) - conn.commit() - total += len(batch) - elapsed = time.time() - t_start - rate = total / elapsed if elapsed > 0 else 0 - log.info(f'Inserted {total:,} rows ({osm_refs:,} OSM xrefs) ' - f'[{rate:.0f} rows/sec, {elapsed:.0f}s elapsed]') - batch = [] - - # Flush remaining - if batch: - with conn.cursor() as cur: - psycopg2.extras.execute_values( - cur, upsert_sql, batch, - template=template, - page_size=BATCH_SIZE - ) - conn.commit() - total += len(batch) - - duck.close() - - # Final stats - elapsed = time.time() - t_start - log.info(f'Import complete: {total:,} rows, {osm_refs:,} OSM cross-refs, ' - f'{elapsed:.0f}s total ({total/elapsed:.0f} rows/sec)') - - # Verify - with conn.cursor() as cur: - cur.execute("SELECT count(*) FROM places") - count = cur.fetchone()[0] - cur.execute("SELECT count(*) FROM places WHERE osm_type IS NOT NULL") - osm_count = cur.fetchone()[0] - log.info(f'Final table: {count:,} total rows, {osm_count:,} with OSM cross-references') - - conn.close() - - -if __name__ == '__main__': - run_import() diff --git a/templates/base.html b/templates/base.html index 4c06892..49b1a21 100644 --- a/templates/base.html +++ b/templates/base.html @@ -21,7 +21,6 @@ PeerTube Kiwix Search - Nav-I Settings {% if subnav %} diff --git a/templates/knowledge/deleted_contacts.html b/templates/knowledge/deleted_contacts.html deleted file mode 100644 index 58a9ff5..0000000 --- a/templates/knowledge/deleted_contacts.html +++ /dev/null @@ -1,56 +0,0 @@ -{% extends "base.html" %} -{% block content %} -

    Deleted Contacts

    -{% if not contacts %} -

    No deleted contacts.

    -{% else %} -
  • - - {% for c in contacts %} - - - - - - - - - {% endfor %} -
    LabelNameCategoryPhoneDeleted AtActions
    {{ c.label }}{{ c.name or '' }}{{ c.category or '' }}{{ c.phone or '' }}{{ c.deleted_at or '' }} - - -
    -{% endif %} -{% endblock %} -{% block scripts %} - -{% endblock %} diff --git a/templates/navi/api_keys.html b/templates/navi/api_keys.html deleted file mode 100644 index abf2d16..0000000 --- a/templates/navi/api_keys.html +++ /dev/null @@ -1,269 +0,0 @@ -{% extends "base.html" %} -{% block content %} -

    API Keys

    - -
    -

    Updating keys does not restart RECON. After updates, click Restart RECON below or restart manually from terminal.

    -
    - -
    Loading keys...
    - - - - - - - - - - - -
    - - -
    - - - -{% endblock %} - -{% block scripts %} - -{% endblock %} diff --git a/templates/navi/deleted_contacts.html b/templates/navi/deleted_contacts.html deleted file mode 100644 index 0847fab..0000000 --- a/templates/navi/deleted_contacts.html +++ /dev/null @@ -1,116 +0,0 @@ -{% extends "base.html" %} -{% block content %} -

    Deleted Contacts

    -{% if not contacts %} -

    No deleted contacts.

    -{% else %} - - - {% for c in contacts %} - - - - - - - - - {% endfor %} -
    LabelNameCategoryPhoneDeleted AtActions
    {{ c.label }}{{ c.name or '' }}{{ c.category or '' }}{{ c.phone or '' }}{{ c.deleted_at or '' }} - - -
    -{% endif %} - - - -{% endblock %} -{% block scripts %} - -{% endblock %} diff --git a/templates/navi/landing.html b/templates/navi/landing.html deleted file mode 100644 index 131f3af..0000000 --- a/templates/navi/landing.html +++ /dev/null @@ -1,22 +0,0 @@ -{% extends "base.html" %} -{% block content %} -

    Nav-I

    -

    Navi frontend management — contacts, API keys, and configuration.

    - - -{% endblock %}