mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-06-10 17:04:39 +02:00
Compare commits
83 commits
feature/sc
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
e840a119dd |
|||
|
6365fe6756 |
|||
|
ac99723e51 |
|||
|
21c0f11eff |
|||
|
879df84b7a |
|||
|
aa6e972260 |
|||
|
79d7b2b343 |
|||
|
adee6d5a69 |
|||
|
86c902f7b5 |
|||
|
1f05d4b4d6 |
|||
|
d7292c4cc7 |
|||
|
d56b1d5f92 |
|||
|
c968497b94 |
|||
|
ed36eec85e |
|||
|
14ad2cd34a |
|||
|
f42b1fef3b |
|||
|
cf74f1840b |
|||
|
25cf5ac16a |
|||
| bb220b7ba3 | |||
| 75664c7d02 | |||
|
f7a501b4d7 |
|||
| dcd4ddd358 | |||
| f67f4ec9e3 | |||
|
dc7591b101 |
|||
| 484dfbd1e0 | |||
|
573347a2ee |
|||
| 3d2d69cd56 | |||
|
a80bb6e848 |
|||
| f276b95753 | |||
| c1ba1f8dc7 | |||
| a04c10ad55 | |||
| d8f84ab55a | |||
| b4e33eb048 | |||
| 05c24f95f6 | |||
| 686b35710a | |||
| cf758476b4 | |||
| 87a4741b8d | |||
| 58347415bc | |||
| ff0721c23e | |||
| 2252905986 | |||
| bc463188d5 | |||
| 1a9dfc8f8d | |||
| 3293cb4238 | |||
| e0eedcedfd | |||
| 26d4bc7478 | |||
| f2a0f81580 | |||
| 227affca9d | |||
| fa456fecb1 | |||
| 83a21854c3 | |||
| b741e217f6 | |||
| 991826b4f1 | |||
| 121eb45b44 | |||
| b5de9c6e39 | |||
| 2387a96a1e | |||
| e9c9cee4f3 | |||
| 07e6d0460b | |||
| 4f96d8f6fe | |||
| 2ed9335f4e | |||
| f35af18320 | |||
| 63b68bfea7 | |||
| 15c58a69ac | |||
| 829bc87b7b | |||
| 9c5b0520f9 | |||
| 3280e34718 | |||
| a4288c0cd8 | |||
| 095bf8c2af | |||
| 620f99c762 | |||
| d460f0e202 | |||
| 65693d15aa | |||
| 2121ee4936 | |||
| 64605b38bb | |||
| e6b81db520 | |||
| d4c5c371ca | |||
| ac69e2761d | |||
| 87b230dcba | |||
| c76d63b785 | |||
| a14501347b | |||
| dfab388769 | |||
| 23483e8198 | |||
| 3243f2f252 | |||
| 9841c38011 | |||
| a9510b5ed9 | |||
| c5283ece3e |
16 changed files with 1073 additions and 9 deletions
18
config/address_book.yaml
Normal file
18
config/address_book.yaml
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
# RECON Address Book — saved locations for navigation shortcuts.
|
||||||
|
# Entries are matched by name and aliases (case-insensitive).
|
||||||
|
# Add new entries by appending to the list below.
|
||||||
|
|
||||||
|
entries:
|
||||||
|
- id: home
|
||||||
|
name: Home
|
||||||
|
aliases:
|
||||||
|
- home
|
||||||
|
- matt's house
|
||||||
|
- 214 north st
|
||||||
|
- 214 north street
|
||||||
|
address: "214 North St, Filer, ID 83328"
|
||||||
|
lat: 42.5735833
|
||||||
|
lon: -114.6066389
|
||||||
|
tags:
|
||||||
|
- residence
|
||||||
|
- primary
|
||||||
67
config/profiles/home.yaml
Normal file
67
config/profiles/home.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
||||||
|
# Deployment profile: Home (VM 1130)
|
||||||
|
# Active on the main Echo6 deployment. Full stack with planet-scale NA tiles.
|
||||||
|
# Override via RECON_PROFILE env var in /etc/systemd/system/recon.service
|
||||||
|
|
||||||
|
profile: home
|
||||||
|
region_name: "North America"
|
||||||
|
|
||||||
|
tileset:
|
||||||
|
url: "/tiles/planet/current.pmtiles"
|
||||||
|
bounds: [-168, 14, -52, 72]
|
||||||
|
max_zoom: 15
|
||||||
|
attribution: "Protomaps © OSM"
|
||||||
|
|
||||||
|
tileset_hillshade:
|
||||||
|
url: "/tiles/planet-dem.pmtiles"
|
||||||
|
encoding: "terrarium"
|
||||||
|
max_zoom: 12
|
||||||
|
|
||||||
|
traffic:
|
||||||
|
provider: "tomtom"
|
||||||
|
proxy_url: "/api/traffic/flow/{z}/{x}/{y}.png"
|
||||||
|
|
||||||
|
place_details:
|
||||||
|
local_source: "nominatim"
|
||||||
|
local_bbox: [-125.0, 31.3, -104.0, 49.0]
|
||||||
|
fallback_source: "overpass"
|
||||||
|
|
||||||
|
services:
|
||||||
|
geocode: "/api/geocode"
|
||||||
|
reverse: "/api/reverse"
|
||||||
|
address_book: "/api/address_book"
|
||||||
|
valhalla: "/valhalla"
|
||||||
|
|
||||||
|
auth:
|
||||||
|
login_url: "/outpost.goauthentik.io/start?rd=%2F"
|
||||||
|
logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/"
|
||||||
|
|
||||||
|
features:
|
||||||
|
has_nominatim_details: true
|
||||||
|
has_kiwix_wiki: true
|
||||||
|
has_hillshade: true
|
||||||
|
has_3d_terrain: false
|
||||||
|
has_traffic_overlay: true
|
||||||
|
has_landclass: true
|
||||||
|
has_public_lands_layer: true
|
||||||
|
has_contours: true
|
||||||
|
has_contours_test: false
|
||||||
|
has_contours_test_10ft: false
|
||||||
|
has_address_book_write: false
|
||||||
|
has_overture_enrichment: true
|
||||||
|
has_google_places_enrichment: true
|
||||||
|
has_contacts: true
|
||||||
|
has_wiki_rewriting: true
|
||||||
|
has_wiki_discovery: false
|
||||||
|
has_usfs_trails: true
|
||||||
|
has_blm_trails: true
|
||||||
|
|
||||||
|
defaults:
|
||||||
|
center: [42.5736, -114.6066]
|
||||||
|
zoom: 10
|
||||||
|
|
||||||
|
# Offroute wilderness routing
|
||||||
|
offroute:
|
||||||
|
osm_pbf_path: "/mnt/nav/sources/idaho-latest.osm.pbf"
|
||||||
|
densify_interval_m: 100
|
||||||
|
postgis_dsn: "dbname=padus"
|
||||||
|
|
||||||
51
config/profiles/minimal_pi.yaml
Normal file
51
config/profiles/minimal_pi.yaml
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
# Deployment profile: Minimal Pi (single-state pocket deployment)
|
||||||
|
# Template for the lightest possible field kit — Idaho only.
|
||||||
|
# Override via RECON_PROFILE env var.
|
||||||
|
|
||||||
|
profile: minimal_pi
|
||||||
|
region_name: "Idaho"
|
||||||
|
|
||||||
|
tileset:
|
||||||
|
url: "/tiles/idaho.pmtiles"
|
||||||
|
bounds: [-117.5, 42.0, -111.0, 49.0]
|
||||||
|
max_zoom: 15
|
||||||
|
attribution: "Protomaps © OSM"
|
||||||
|
|
||||||
|
tileset_hillshade:
|
||||||
|
url: "/tiles/hillshade-idaho.pmtiles"
|
||||||
|
encoding: "terrarium"
|
||||||
|
max_zoom: 12
|
||||||
|
|
||||||
|
traffic:
|
||||||
|
provider: "tomtom"
|
||||||
|
proxy_url: "/api/traffic/flow/{z}/{x}/{y}.png"
|
||||||
|
|
||||||
|
services:
|
||||||
|
geocode: "/api/geocode"
|
||||||
|
reverse: "/api/reverse"
|
||||||
|
address_book: "/api/address_book"
|
||||||
|
valhalla: "/valhalla"
|
||||||
|
|
||||||
|
# TODO(matt): confirm logout next= host for this profile
|
||||||
|
auth:
|
||||||
|
login_url: "/outpost.goauthentik.io/start?rd=%2F"
|
||||||
|
logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/"
|
||||||
|
|
||||||
|
features:
|
||||||
|
has_nominatim_details: false
|
||||||
|
has_kiwix_wiki: false
|
||||||
|
has_hillshade: false
|
||||||
|
has_3d_terrain: false
|
||||||
|
has_traffic_overlay: false
|
||||||
|
has_landclass: false
|
||||||
|
has_public_lands_layer: false
|
||||||
|
has_address_book_write: true
|
||||||
|
has_overture_enrichment: false
|
||||||
|
has_google_places_enrichment: false
|
||||||
|
has_contacts: false
|
||||||
|
has_wiki_rewriting: false
|
||||||
|
has_wiki_discovery: false
|
||||||
|
|
||||||
|
defaults:
|
||||||
|
center: [44.0, -114.0]
|
||||||
|
zoom: 7
|
||||||
59
config/profiles/regional_pi.yaml
Normal file
59
config/profiles/regional_pi.yaml
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
# Deployment profile: Regional Pi (multi-state field kit)
|
||||||
|
# Template for a Raspberry Pi covering Idaho + surrounding states.
|
||||||
|
# Override via RECON_PROFILE env var.
|
||||||
|
|
||||||
|
profile: regional_pi
|
||||||
|
region_name: "Idaho + Neighbors"
|
||||||
|
|
||||||
|
tileset:
|
||||||
|
url: "/tiles/regional.pmtiles"
|
||||||
|
bounds: [-125, 40, -104, 49]
|
||||||
|
max_zoom: 15
|
||||||
|
attribution: "Protomaps © OSM"
|
||||||
|
|
||||||
|
tileset_hillshade:
|
||||||
|
url: "/tiles/hillshade-regional.pmtiles"
|
||||||
|
encoding: "terrarium"
|
||||||
|
max_zoom: 12
|
||||||
|
|
||||||
|
traffic:
|
||||||
|
provider: "tomtom"
|
||||||
|
proxy_url: "/api/traffic/flow/{z}/{x}/{y}.png"
|
||||||
|
|
||||||
|
place_details:
|
||||||
|
local_source: "nominatim"
|
||||||
|
local_bbox: [-125.0, 40.0, -104.0, 49.0]
|
||||||
|
fallback_source: "overpass"
|
||||||
|
|
||||||
|
services:
|
||||||
|
geocode: "/api/geocode"
|
||||||
|
reverse: "/api/reverse"
|
||||||
|
address_book: "/api/address_book"
|
||||||
|
valhalla: "/valhalla"
|
||||||
|
|
||||||
|
# TODO(matt): confirm logout next= host for this profile
|
||||||
|
auth:
|
||||||
|
login_url: "/outpost.goauthentik.io/start?rd=%2F"
|
||||||
|
logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/"
|
||||||
|
|
||||||
|
features:
|
||||||
|
has_nominatim_details: true
|
||||||
|
has_kiwix_wiki: false
|
||||||
|
has_hillshade: true
|
||||||
|
has_3d_terrain: false
|
||||||
|
has_traffic_overlay: true
|
||||||
|
has_landclass: true
|
||||||
|
has_public_lands_layer: true
|
||||||
|
has_contours: true
|
||||||
|
has_contours_test: true
|
||||||
|
has_contours_test_10ft: true
|
||||||
|
has_address_book_write: true
|
||||||
|
has_overture_enrichment: false
|
||||||
|
has_google_places_enrichment: false
|
||||||
|
has_contacts: false
|
||||||
|
has_wiki_rewriting: true
|
||||||
|
has_wiki_discovery: false
|
||||||
|
|
||||||
|
defaults:
|
||||||
|
center: [44.0, -114.0]
|
||||||
|
zoom: 7
|
||||||
|
|
@ -57,6 +57,10 @@ class _LargeZimRequest(_FlaskRequest):
|
||||||
return super()._get_file_stream(total_content_length, content_type, filename, content_length)
|
return super()._get_file_stream(total_content_length, content_type, filename, content_length)
|
||||||
|
|
||||||
app.request_class = _LargeZimRequest
|
app.request_class = _LargeZimRequest
|
||||||
|
# ── Netsyms Blueprint ──
|
||||||
|
from .netsyms_api import netsyms_bp
|
||||||
|
app.register_blueprint(netsyms_bp)
|
||||||
|
|
||||||
|
|
||||||
# ── Navigation Constants ──
|
# ── Navigation Constants ──
|
||||||
|
|
||||||
|
|
@ -1315,6 +1319,9 @@ def api_keys_reload():
|
||||||
return jsonify({'count': count})
|
return jsonify({'count': count})
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ── YouTube Cookie Management ──
|
# ── YouTube Cookie Management ──
|
||||||
|
|
||||||
PEERTUBE_HOST = '192.168.1.170'
|
PEERTUBE_HOST = '192.168.1.170'
|
||||||
|
|
|
||||||
117
lib/aurora_nav_tool.py
Normal file
117
lib/aurora_nav_tool.py
Normal file
|
|
@ -0,0 +1,117 @@
|
||||||
|
"""
|
||||||
|
title: Navigation
|
||||||
|
author: Echo6
|
||||||
|
version: 1.1.0
|
||||||
|
description: Turn-by-turn directions and geocoding via Photon + Valhalla on recon-vm. Supports driving, walking, cycling, and truck routing with worldwide coverage (281M places).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
_COORD_RE = re.compile(r'^(-?\d+\.?\d*)\s*,\s*(-?\d+\.?\d*)$')
|
||||||
|
|
||||||
|
|
||||||
|
class Tools:
    """Navigation tool: geocode locations via Photon and route via Valhalla.

    Service endpoints are configurable through ``Valves``.
    """

    class Valves(BaseModel):
        """Service endpoints used by the tool."""

        photon_url: str = Field(
            default="http://100.64.0.24:2322",
            description="Photon geocoding service URL (recon-vm)",
        )
        valhalla_url: str = Field(
            default="http://100.64.0.24:8002",
            description="Valhalla routing service URL (recon-vm)",
        )

    def __init__(self):
        self.valves = self.Valves()

    def _geocode(self, query: str):
        """Resolve *query* to ``(lat, lon, label)``.

        A literal ``lat,lon`` pair is parsed locally and returned with the
        original query as its label. Anything else is sent to Photon and the
        first hit is used. Returns ``(None, None, None)`` when Photon has no
        match.

        Raises ``requests.RequestException`` on transport/HTTP errors and
        ``ValueError``/``LookupError`` on a malformed response; the caller is
        expected to handle these.
        """
        m = _COORD_RE.match(query.strip())
        if m:
            lat, lon = float(m.group(1)), float(m.group(2))
            return lat, lon, query

        resp = requests.get(
            f"{self.valves.photon_url}/api",
            params={"q": query, "limit": 1},
            timeout=10,
        )
        resp.raise_for_status()
        features = resp.json().get("features", [])
        if not features:
            return None, None, None

        props = features[0]["properties"]
        coords = features[0]["geometry"]["coordinates"]
        # Build "name, city, state, country", skipping a value that merely
        # repeats the previous component.
        parts = [props.get("name", "")]
        for key in ("city", "state", "country"):
            v = props.get(key)
            if v and v != parts[-1]:
                parts.append(v)
        label = ", ".join(p for p in parts if p)
        # The code reads coordinates as [lon, lat]; return them as lat, lon.
        # Fall back to the query text when Photon supplied no usable label.
        return coords[1], coords[0], label or query

    def get_directions(
        self,
        origin: str,
        destination: str,
        mode: str = "auto",
    ) -> str:
        """
        Get turn-by-turn directions between two locations. When this tool returns results, present the directions exactly as returned — do not summarize or rephrase. Include all steps.

        :param origin: Starting location — address, place name, or lat,lon coordinates
        :param destination: Destination — address, place name, or lat,lon coordinates
        :param mode: Travel mode: auto, pedestrian, bicycle, or truck (default: auto)
        :return: Formatted turn-by-turn directions
        """
        # Unknown modes silently fall back to driving.
        if mode not in ("auto", "pedestrian", "bicycle", "truck"):
            mode = "auto"

        # Geocoding talks to the network too: report outages the same way the
        # routing call does instead of letting the exception escape the tool.
        try:
            orig_lat, orig_lon, orig_name = self._geocode(origin)
            if orig_lat is None:
                return f"Could not find location: {origin}"

            dest_lat, dest_lon, dest_name = self._geocode(destination)
            if dest_lat is None:
                return f"Could not find location: {destination}"
        except (requests.RequestException, ValueError, LookupError):
            return "Navigation service unavailable"

        try:
            resp = requests.post(
                f"{self.valves.valhalla_url}/route",
                json={
                    "locations": [
                        {"lat": orig_lat, "lon": orig_lon},
                        {"lat": dest_lat, "lon": dest_lon},
                    ],
                    "costing": mode,
                    "directions_options": {"units": "miles"},
                },
                timeout=30,
            )
        except requests.RequestException:
            return "Navigation service unavailable"

        if resp.status_code != 200:
            return "No route found between locations"

        # A 200 with an unexpected payload is treated as "no route" rather
        # than crashing the tool call.
        try:
            trip = resp.json()["trip"]
            summary = trip["summary"]
            # Two locations were requested, so only the first leg is read.
            maneuvers = trip["legs"][0]["maneuvers"]
            miles = round(summary["length"], 1)
            minutes = round(summary["time"] / 60, 1)
        except (ValueError, LookupError, TypeError):
            return "No route found between locations"

        lines = [
            f"Directions from {orig_name} to {dest_name} ({mode}):",
            f"Distance: {miles} miles | Time: {minutes} minutes",
            "",
        ]
        for i, step in enumerate(maneuvers, 1):
            inst = step.get("instruction", "")
            dist = step.get("length", 0)
            if dist > 0:
                lines.append(f"{i}. {inst} — {round(dist, 1)} mi")
            else:
                lines.append(f"{i}. {inst}")

        return "\n".join(lines)
|
||||||
22
lib/auth.py
Normal file
22
lib/auth.py
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
"""
|
||||||
|
RECON Auth Helper — extract user identity from Authentik forward-auth headers.
|
||||||
|
"""
|
||||||
|
from functools import wraps
|
||||||
|
from flask import request, jsonify
|
||||||
|
|
||||||
|
|
||||||
|
def get_user_id():
    """Read the caller's identity from the current request.

    Returns the value of the ``X-Authentik-Username`` header, or ``None``
    when the header is not present.
    """
    username = request.headers.get('X-Authentik-Username')
    return username
|
||||||
|
|
||||||
|
|
||||||
|
def require_auth(f):
    """Decorator that rejects requests lacking an Authentik username header.

    On success the username is stored on ``request.user_id`` before the
    wrapped view runs; otherwise a JSON error body is returned with HTTP 401.
    """
    @wraps(f)
    def wrapper(*args, **kwargs):
        user_id = get_user_id()
        if user_id:
            # Expose the identity to the view through the request object.
            request.user_id = user_id
            return f(*args, **kwargs)
        return jsonify({'error': 'Authentication required'}), 401

    return wrapper
|
||||||
54
lib/deployment_config.py
Normal file
54
lib/deployment_config.py
Normal file
|
|
@ -0,0 +1,54 @@
|
||||||
|
"""
|
||||||
|
Deployment profile loader.
|
||||||
|
|
||||||
|
Reads RECON_PROFILE env var (default: "home"), loads the matching YAML
|
||||||
|
from config/profiles/<profile>.yaml, and caches the parsed dict in memory.
|
||||||
|
|
||||||
|
Exposes get_deployment_config() as the in-process accessor for the profile.
|
||||||
|
|
||||||
|
Note: its former consumers (the /api/landclass gate, google_places,
|
||||||
|
place_detail, offroute/router) were all extracted to navi-* services or removed
|
||||||
|
across cleanups #4–#6/#27 — recon has no remaining caller of
|
||||||
|
get_deployment_config() today; the module is retained per cleanup #1.
|
||||||
|
(The former /api/config HTTP endpoint that served this dict to the frontend was
|
||||||
|
removed once navi-config (:8422) took over that route.)
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
from .utils import setup_logging
|
||||||
|
|
||||||
|
logger = setup_logging('recon.deployment_config')
|
||||||
|
|
||||||
|
_config_cache = None
|
||||||
|
|
||||||
|
|
||||||
|
def load_deployment_config():
    """Load the deployment profile from disk and cache it.

    The profile name comes from the ``RECON_PROFILE`` environment variable
    (default ``"home"``) and is resolved to
    ``<repo>/config/profiles/<profile>.yaml``. Runs at import time and again
    lazily from ``get_deployment_config()`` if the cache is still empty.

    Returns:
        The parsed profile as a dict (also stored in ``_config_cache``).

    Raises:
        FileNotFoundError: the profile file does not exist.
        ValueError: the profile file is empty or is not a YAML mapping.
    """
    global _config_cache

    profile = os.environ.get('RECON_PROFILE', 'home')
    config_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'config', 'profiles')
    config_path = os.path.join(config_dir, f'{profile}.yaml')

    if not os.path.exists(config_path):
        # Listing the directory is only a hint for the error message; a
        # missing or unreadable directory must not mask the real problem.
        try:
            available = sorted(
                name[:-len('.yaml')]
                for name in os.listdir(config_dir)
                if name.endswith('.yaml')
            )
        except OSError:
            available = []
        raise FileNotFoundError(
            f"Deployment profile '{profile}' not found at {config_path}. "
            f"Available profiles: {', '.join(available) or '(none)'}"
        )

    with open(config_path, 'r', encoding='utf-8') as f:
        loaded = yaml.safe_load(f)

    # safe_load returns None for an empty file and may return a list or
    # scalar; fail with a clear message instead of an AttributeError below.
    if not isinstance(loaded, dict):
        raise ValueError(
            f"Deployment profile '{profile}' at {config_path} must be a YAML mapping, "
            f"got {type(loaded).__name__}"
        )

    _config_cache = loaded
    logger.info(f"Loaded deployment profile: {profile} ({_config_cache.get('region_name', 'unknown')})")
    return _config_cache
|
||||||
|
|
||||||
|
|
||||||
|
def get_deployment_config():
    """Return the deployment profile dict, loading it first if not yet cached."""
    if _config_cache is not None:
        return _config_cache
    # Populates _config_cache as a side effect.
    load_deployment_config()
    return _config_cache
|
||||||
|
|
||||||
|
|
||||||
|
# Load on import so startup fails fast if profile is missing
|
||||||
|
load_deployment_config()
|
||||||
|
|
@ -21,6 +21,7 @@ Config: processing.extract_workers, processing.max_pdf_size_mb,
|
||||||
processing.extract_timeout, processing.page_timeout
|
processing.extract_timeout, processing.page_timeout
|
||||||
"""
|
"""
|
||||||
import base64
|
import base64
|
||||||
|
import re
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
|
@ -99,6 +100,40 @@ def _is_transient(error_str):
|
||||||
return any(sig in s for sig in transient_signals)
|
return any(sig in s for sig in transient_signals)
|
||||||
|
|
||||||
|
|
||||||
|
def _text_quality_ok(text, min_length=50):
|
||||||
|
"""Check if extracted text meets quality thresholds.
|
||||||
|
|
||||||
|
Beyond the basic length check, validates:
|
||||||
|
- Word-boundary ratio: at least 60% of tokens should be real words (2+ alpha chars)
|
||||||
|
- Concatenation ratio: lowercase-immediately-followed-by-uppercase shouldn't exceed 10% of word count
|
||||||
|
|
||||||
|
Returns True if text passes all checks.
|
||||||
|
"""
|
||||||
|
text = text.strip()
|
||||||
|
if len(text) < min_length:
|
||||||
|
return False
|
||||||
|
|
||||||
|
words = text.split()
|
||||||
|
if not words:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Word-like ratio: tokens with 2+ alphabetic characters
|
||||||
|
word_like = sum(1 for w in words if len(re.findall(r'[a-zA-Z]', w)) >= 2)
|
||||||
|
word_ratio = word_like / len(words)
|
||||||
|
if word_ratio < 0.60:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Concatenation detector: lowercase immediately followed by uppercase
|
||||||
|
# Filter out common camelCase patterns in code (short tokens)
|
||||||
|
concat_hits = len(re.findall(r'[a-z][A-Z]', text))
|
||||||
|
concat_ratio = concat_hits / len(words) if words else 0
|
||||||
|
if concat_ratio > 0.10:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _render_page_to_png(pdf_path, page_num_1indexed, dpi=200, timeout=30):
|
def _render_page_to_png(pdf_path, page_num_1indexed, dpi=200, timeout=30):
|
||||||
"""Render a single PDF page to PNG bytes using pdftoppm.
|
"""Render a single PDF page to PNG bytes using pdftoppm.
|
||||||
|
|
||||||
|
|
@ -224,7 +259,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30):
|
||||||
# Method 1: pdftotext (poppler)
|
# Method 1: pdftotext (poppler)
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['pdftotext', '-f', str(page_num_0indexed + 1),
|
['pdftotext', '-layout', '-f', str(page_num_0indexed + 1),
|
||||||
'-l', str(page_num_0indexed + 1), pdf_path, '-'],
|
'-l', str(page_num_0indexed + 1), pdf_path, '-'],
|
||||||
capture_output=True, text=True, timeout=page_timeout
|
capture_output=True, text=True, timeout=page_timeout
|
||||||
)
|
)
|
||||||
|
|
@ -233,7 +268,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30):
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if len(text.strip()) >= 50:
|
if _text_quality_ok(text):
|
||||||
return text, 'pdftotext'
|
return text, 'pdftotext'
|
||||||
|
|
||||||
# Method 2: pdftoppm + Tesseract OCR
|
# Method 2: pdftoppm + Tesseract OCR
|
||||||
|
|
@ -258,7 +293,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30):
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if len(text.strip()) >= 50:
|
if _text_quality_ok(text):
|
||||||
return text, 'tesseract'
|
return text, 'tesseract'
|
||||||
|
|
||||||
# Method 3: Gemini Vision (last resort)
|
# Method 3: Gemini Vision (last resort)
|
||||||
|
|
@ -276,8 +311,26 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30):
|
||||||
# ── Core extraction functions ──
|
# ── Core extraction functions ──
|
||||||
|
|
||||||
def _pypdf2_extract(reader, page_num):
|
def _pypdf2_extract(reader, page_num):
|
||||||
"""Extract text from a PyPDF2 page object. Runs inside a thread for timeout."""
|
"""Extract text from a PyPDF2 page object. Runs inside a thread for timeout.
|
||||||
return reader.pages[page_num].extract_text() or ''
|
|
||||||
|
Tries default extraction first (space_width=200). If quality check fails,
|
||||||
|
retries with space_width=100 which better detects word boundaries in
|
||||||
|
tightly-kerned PDFs (common in Haynes/workshop manuals).
|
||||||
|
|
||||||
|
Note: PyPDF2 3.0.1 does not support layout=True. The space_width parameter
|
||||||
|
controls word-boundary detection tolerance. Lower values = more aggressive
|
||||||
|
space insertion between characters.
|
||||||
|
"""
|
||||||
|
text = reader.pages[page_num].extract_text() or ''
|
||||||
|
if _text_quality_ok(text):
|
||||||
|
return text
|
||||||
|
|
||||||
|
# Retry with tighter word-boundary detection
|
||||||
|
text_tight = reader.pages[page_num].extract_text(space_width=100.0) or ''
|
||||||
|
if len(text_tight.strip()) >= len(text.strip()):
|
||||||
|
return text_tight
|
||||||
|
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
|
def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
|
||||||
|
|
@ -302,13 +355,13 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
|
||||||
except Exception:
|
except Exception:
|
||||||
text = ''
|
text = ''
|
||||||
|
|
||||||
if len(text.strip()) >= 50:
|
if _text_quality_ok(text):
|
||||||
return text, 'pypdf2'
|
return text, 'pypdf2'
|
||||||
|
|
||||||
# Method 2: pdftotext via subprocess (inherently timeout-safe)
|
# Method 2: pdftotext via subprocess (inherently timeout-safe)
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['pdftotext', '-f', str(page_num + 1), '-l', str(page_num + 1), pdf_path, '-'],
|
['pdftotext', '-layout', '-f', str(page_num + 1), '-l', str(page_num + 1), pdf_path, '-'],
|
||||||
capture_output=True, text=True, timeout=page_timeout
|
capture_output=True, text=True, timeout=page_timeout
|
||||||
)
|
)
|
||||||
if result.returncode == 0 and len(result.stdout.strip()) > len(text.strip()):
|
if result.returncode == 0 and len(result.stdout.strip()) > len(text.strip()):
|
||||||
|
|
@ -316,7 +369,7 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if len(text.strip()) >= 50:
|
if _text_quality_ok(text):
|
||||||
return text, 'pdftotext'
|
return text, 'pdftotext'
|
||||||
|
|
||||||
# Method 3: pdftoppm + Tesseract OCR
|
# Method 3: pdftoppm + Tesseract OCR
|
||||||
|
|
@ -340,7 +393,7 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if len(text.strip()) >= 50:
|
if _text_quality_ok(text):
|
||||||
return text, 'tesseract'
|
return text, 'tesseract'
|
||||||
|
|
||||||
# Method 4: Gemini Vision (last resort — costs API calls but handles scanned docs)
|
# Method 4: Gemini Vision (last resort — costs API calls but handles scanned docs)
|
||||||
|
|
|
||||||
228
lib/netsyms.py
Normal file
228
lib/netsyms.py
Normal file
|
|
@ -0,0 +1,228 @@
|
||||||
|
"""
|
||||||
|
RECON Netsyms AddressDatabase2025 — SQLite-backed US+CA address lookup.
|
||||||
|
|
||||||
|
Provides 159.78M geocoded addresses as tier-2 between address book
|
||||||
|
(exact named locations) and Photon (full-text global geocoding).
|
||||||
|
|
||||||
|
Database: /mnt/nav/addresses/AddressDatabase2025.sqlite (read-only)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import threading
|
||||||
|
|
||||||
|
from .utils import setup_logging
|
||||||
|
|
||||||
|
logger = setup_logging('recon.netsyms')
|
||||||
|
|
||||||
|
_DB_PATH = '/mnt/nav/addresses/AddressDatabase2025.sqlite'
|
||||||
|
|
||||||
|
_conn = None
|
||||||
|
_lock = threading.Lock()
|
||||||
|
_cached_row_count = None
|
||||||
|
|
||||||
|
# US states + DC + territories, CA provinces, for free-text parsing
|
||||||
|
_STATE_CODES = {
|
||||||
|
'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
|
||||||
|
'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
|
||||||
|
'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
|
||||||
|
'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
|
||||||
|
'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
|
||||||
|
'DC', 'PR', 'VI', 'GU', 'AS', 'MP',
|
||||||
|
# Canadian provinces
|
||||||
|
'AB', 'BC', 'MB', 'NB', 'NL', 'NS', 'NT', 'NU', 'ON', 'PE',
|
||||||
|
'QC', 'SK', 'YT',
|
||||||
|
}
|
||||||
|
|
||||||
|
_NUMBER_RE = re.compile(r'^(\d+[\w-]*)(.*)$')
|
||||||
|
|
||||||
|
|
||||||
|
def _get_conn():
    """Return the shared read-only SQLite connection, opening it on first use.

    The connection is created with ``check_same_thread=False`` and
    ``sqlite3.Row`` as its row factory. It is fully configured before being
    stored in the module global, so a caller taking the lock-free fast path
    never sees a connection that still returns plain tuples.

    Raises:
        sqlite3.Error: the database file cannot be opened.
    """
    global _conn
    if _conn is not None:
        return _conn
    with _lock:
        # Re-check under the lock: another thread may have opened it
        # while this one was waiting.
        if _conn is None:
            uri = f'file:{_DB_PATH}?mode=ro'
            conn = sqlite3.connect(uri, uri=True, check_same_thread=False)
            conn.row_factory = sqlite3.Row
            # Publish only after configuration is complete.
            _conn = conn
            logger.info("Netsyms DB opened: %s", _DB_PATH)
        return _conn
|
||||||
|
|
||||||
|
|
||||||
|
def _row_to_dict(row):
|
||||||
|
"""Convert a sqlite3.Row to a plain dict with lat/lon keys."""
|
||||||
|
return {
|
||||||
|
'zipcode': row['zipcode'],
|
||||||
|
'number': row['number'],
|
||||||
|
'street': row['street'],
|
||||||
|
'street2': row['street2'],
|
||||||
|
'city': row['city'],
|
||||||
|
'state': row['state'],
|
||||||
|
'plus4': row['plus4'],
|
||||||
|
'country': row['country'],
|
||||||
|
'lat': float(row['latitude']),
|
||||||
|
'lon': float(row['longitude']),
|
||||||
|
'source': row['source'],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def lookup_by_street(number, street, city=None, state=None,
                     zipcode=None, country=None, limit=20):
    """Find addresses by exact house number and street name.

    ``city``, ``state``, ``zipcode`` and ``country`` narrow the match when
    given. Text values are stripped and upper-cased before comparison (the
    zipcode is only stripped). Returns a list of address dicts, at most
    ``limit`` long; a SQLite error is logged and yields an empty list.
    """
    conn = _get_conn()

    where = ['number = ?', 'street = ?']
    args = [str(number).strip().upper(), street.strip().upper()]

    # (column, supplied value, upper-case it?) for each optional qualifier.
    qualifiers = (
        ('city', city, True),
        ('state', state, True),
        ('zipcode', zipcode, False),
        ('country', country, True),
    )
    for column, value, fold_case in qualifiers:
        if not value:
            continue
        where.append(f'{column} = ?')
        cleaned = value.strip()
        args.append(cleaned.upper() if fold_case else cleaned)

    sql = f"SELECT * FROM addresses WHERE {' AND '.join(where)} LIMIT ?"
    args.append(limit)

    with _lock:
        try:
            rows = conn.execute(sql, args).fetchall()
        except sqlite3.Error as e:
            logger.warning("Netsyms lookup_by_street error: %s", e)
            return []

    results = list(map(_row_to_dict, rows))
    logger.debug("lookup_by_street(%s, %s, city=%s, state=%s) → %d results",
                 number, street, city, state, len(results))
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def lookup_free_text(query, country_hint=None):
    """Parse a free-text address and look it up.

    Parsing steps:
    1. A trailing 5-digit zipcode is split off.
    2. A trailing token that is a known state/province code is taken as the
       state.
    3. A leading token starting with a digit is taken as the house number.
    4. The remainder is split into street and city.

    Several readings of step 4 are tried in order until one returns rows:
    - comma-aware: when a state was found and commas separate the parts, the
      last comma segment is the city, so multi-word cities such as
      "Twin Falls" stay intact;
    - flat: the last remaining token is the city (only when a state was found);
    - no-state: the would-be state token is put back on the street, covering
      street suffixes that collide with state codes (e.g. "Oak Ct").

    If no street match is found, or no house number was given, a zipcode-only
    lookup is used when a zipcode is available.

    Args:
        query: Free-text address.
        country_hint: Optional country code passed through to the street lookup.

    Returns:
        A list of address dicts, possibly empty.
    """
    q = query.strip()
    if not q:
        return []

    # Strip trailing zipcode if present
    zipcode = None
    zip_match = re.search(r'\b(\d{5})\s*$', q)
    if zip_match:
        zipcode = zip_match.group(1)
        q = q[:zip_match.start()].strip().rstrip(',').strip()

    # Keep the comma structure (segments) alongside the flat token list.
    segments = [[t for t in re.split(r'\s+', seg) if t] for seg in q.split(',')]
    segments = [seg for seg in segments if seg]
    tokens = [t for seg in segments for t in seg]
    if not tokens:
        return []

    # Strip trailing state
    state = None
    if len(tokens) >= 2 and tokens[-1].upper() in _STATE_CODES:
        state = tokens[-1].upper()
        tokens = tokens[:-1]
        segments[-1] = segments[-1][:-1]
        if not segments[-1]:
            segments.pop()

    # Leading digits → number
    number = None
    if tokens and re.match(r'^\d', tokens[0]):
        number = tokens[0]
        tokens = tokens[1:]
        segments[0] = segments[0][1:]
        if not segments[0]:
            segments.pop(0)

    if not tokens:
        # Only a number, or empty — try zipcode if we have one
        if zipcode:
            return lookup_by_zipcode(zipcode, limit=20)
        return []

    # Candidate (street, city, state) readings, most specific first.
    candidates = []
    if state and len(segments) >= 2:
        comma_street = ' '.join(t for seg in segments[:-1] for t in seg)
        candidates.append((comma_street, ' '.join(segments[-1]), state))
    if state and len(tokens) >= 2:
        candidates.append((' '.join(tokens[:-1]), tokens[-1], state))
    else:
        candidates.append((' '.join(tokens), None, state))
    if state:
        candidates.append((' '.join(tokens + [state]), None, None))

    if number:
        tried = set()
        for candidate in candidates:
            if candidate in tried:
                continue
            tried.add(candidate)
            street, city, cand_state = candidate
            results = lookup_by_street(number, street, city=city, state=cand_state,
                                       zipcode=zipcode, country=country_hint)
            if results:
                logger.debug("lookup_free_text(%r) → %d results via street match",
                             query, len(results))
                return results

    # Fallback: try zipcode only if available
    if zipcode:
        return lookup_by_zipcode(zipcode, limit=20)

    logger.debug("lookup_free_text(%r) → 0 results", query)
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def lookup_by_zipcode(zipcode, limit=100):
    """Return up to ``limit`` addresses whose zipcode equals the stripped input.

    A SQLite error is logged and yields an empty list.
    """
    conn = _get_conn()
    query_args = [zipcode.strip(), limit]

    with _lock:
        try:
            cursor = conn.execute(
                "SELECT * FROM addresses WHERE zipcode = ? LIMIT ?", query_args
            )
            rows = cursor.fetchall()
        except sqlite3.Error as e:
            logger.warning("Netsyms lookup_by_zipcode error: %s", e)
            return []

    results = list(map(_row_to_dict, rows))
    logger.debug("lookup_by_zipcode(%s) → %d results", zipcode, len(results))
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def health():
    """Health check with cached row count.

    Returns:
        A dict with the keys ``ok``, ``row_count``, ``file_size_bytes`` and
        ``indexed_countries``. ``ok`` is False when the database file size
        cannot be read or the connection cannot be obtained; in those cases
        the counts are zero/empty.

    The row count is computed once and cached in ``_cached_row_count``. A
    failed COUNT query is reported as 0 for this call only and is NOT cached,
    so a transient database error does not pin the reported count at 0 for
    the rest of the process lifetime.
    """
    global _cached_row_count

    try:
        file_size = os.path.getsize(_DB_PATH)
    except OSError:
        return {'ok': False, 'row_count': 0, 'file_size_bytes': 0,
                'indexed_countries': []}

    try:
        conn = _get_conn()
    except Exception:
        return {'ok': False, 'row_count': 0, 'file_size_bytes': file_size,
                'indexed_countries': []}

    row_count = _cached_row_count
    if row_count is None:
        with _lock:
            # Re-check under the lock: another thread may have filled the
            # cache while this one was waiting.
            row_count = _cached_row_count
            if row_count is None:
                try:
                    row = conn.execute(
                        "SELECT COUNT(*) AS cnt FROM addresses"
                    ).fetchone()
                    row_count = _cached_row_count = row['cnt']
                except sqlite3.Error as e:
                    logger.warning("Netsyms health row count error: %s", e)
                    row_count = 0  # reported, but deliberately not cached

    with _lock:
        try:
            rows = conn.execute(
                "SELECT DISTINCT country FROM addresses"
            ).fetchall()
            # NULL countries come back as None, which cannot be ordered
            # against str values; leave them out of the sorted list.
            countries = sorted(
                r['country'] for r in rows if r['country'] is not None
            )
        except sqlite3.Error as e:
            logger.warning("Netsyms health country list error: %s", e)
            countries = []

    return {
        'ok': True,
        'row_count': row_count,
        'file_size_bytes': file_size,
        'indexed_countries': countries,
    }
|
||||||
31
lib/netsyms_api.py
Normal file
31
lib/netsyms_api.py
Normal file
|
|
@ -0,0 +1,31 @@
|
||||||
|
"""
|
||||||
|
RECON Netsyms API — Flask Blueprint.
|
||||||
|
|
||||||
|
GET /api/netsyms/lookup?q=<free text>&country=<optional>
|
||||||
|
GET /api/netsyms/health
|
||||||
|
"""
|
||||||
|
|
||||||
|
from flask import Blueprint, request, jsonify
|
||||||
|
|
||||||
|
from . import netsyms
|
||||||
|
from .utils import setup_logging
|
||||||
|
|
||||||
|
# Module logger, created through the project's setup_logging helper.
logger = setup_logging('recon.netsyms_api')

# Flask blueprint that the /api/netsyms/* route handlers below register on.
netsyms_bp = Blueprint('netsyms', __name__)
|
||||||
|
|
||||||
|
|
||||||
|
@netsyms_bp.route('/api/netsyms/lookup')
def api_netsyms_lookup():
    """Handle GET /api/netsyms/lookup.

    Reads the ``q`` query parameter (required) and the ``country`` query
    parameter (optional), both stripped of surrounding whitespace. A blank
    ``q`` yields a 400 JSON error; a blank ``country`` is passed on as None.
    Otherwise responds with the lookup results, their count and the query.
    """
    query = request.args.get('q', '').strip()
    if not query:
        return jsonify({'error': 'Missing q parameter'}), 400

    country_hint = request.args.get('country', '').strip()
    if not country_hint:
        country_hint = None

    matches = netsyms.lookup_free_text(query, country_hint=country_hint)
    payload = {'results': matches, 'count': len(matches), 'query': query}
    return jsonify(payload)
|
||||||
|
|
||||||
|
|
||||||
|
@netsyms_bp.route('/api/netsyms/health')
def api_netsyms_health():
    """Handle GET /api/netsyms/health by returning netsyms.health() as JSON."""
    report = netsyms.health()
    return jsonify(report)
|
||||||
80
lib/netsyms_test.py
Normal file
80
lib/netsyms_test.py
Normal file
|
|
@ -0,0 +1,80 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Tests for Netsyms address database module."""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Ensure the lib directory is importable
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from lib import netsyms
|
||||||
|
|
||||||
|
|
||||||
|
def test_lookup_by_street_lowercase():
    """Look up 214 North St, Filer, ID (mixed case) and check the coordinates.

    The first result must lie within 0.01 degrees of (42.5736, -114.6066).
    """
    hits = netsyms.lookup_by_street("214", "North St", city="Filer", state="ID")
    hit_count = len(hits)
    assert hit_count >= 1, f"Expected at least 1 result, got {hit_count}"

    first = hits[0]
    lat, lon = first['lat'], first['lon']
    assert abs(lat - 42.5736) < 0.01, f"Lat mismatch: {lat}"
    assert abs(lon - (-114.6066)) < 0.01, f"Lon mismatch: {lon}"
    print(" PASS: lookup_by_street (lowercase)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_lookup_by_street_uppercase():
    """Look up 214 NORTH ST, FILER, ID (upper case) and check the latitude.

    The first result's latitude must lie within 0.01 degrees of 42.5736.
    """
    hits = netsyms.lookup_by_street("214", "NORTH ST", city="FILER", state="ID")
    hit_count = len(hits)
    assert hit_count >= 1, f"Expected at least 1 result, got {hit_count}"

    lat = hits[0]['lat']
    assert abs(lat - 42.5736) < 0.01, f"Lat mismatch: {lat}"
    print(" PASS: lookup_by_street (uppercase)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_lookup_nonexistent():
    """A street lookup for an address that should not exist returns []."""
    hits = netsyms.lookup_by_street(
        "999999",
        "Nonexistent Rd",
        city="Filer",
        state="ID",
    )
    assert hits == [], f"Expected empty list, got {len(hits)} results"
    print(" PASS: lookup_by_street (nonexistent)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_free_text_with_commas():
    """Free-text lookup of a comma-separated address resolves city and state.

    The first result for "214 North St, Filer, ID" must report city 'FILER'
    and state 'ID'.
    """
    hits = netsyms.lookup_free_text("214 North St, Filer, ID")
    hit_count = len(hits)
    assert hit_count >= 1, f"Expected at least 1 result, got {hit_count}"

    first = hits[0]
    city, state = first['city'], first['state']
    assert city == 'FILER', f"City mismatch: {city}"
    assert state == 'ID', f"State mismatch: {state}"
    print(" PASS: lookup_free_text (commas)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_free_text_no_commas():
    """Free-text lookup of a space-separated address still resolves the state.

    The first result for "214 North St Filer ID" must report state 'ID'.
    """
    hits = netsyms.lookup_free_text("214 North St Filer ID")
    hit_count = len(hits)
    assert hit_count >= 1, f"Expected at least 1 result, got {hit_count}"

    state = hits[0]['state']
    assert state == 'ID', f"State mismatch: {state}"
    print(" PASS: lookup_free_text (no commas)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_lookup_by_zipcode():
    """Zipcode lookup honours ``limit`` and returns only matching rows.

    Asks for 5 rows of zipcode 83328 and expects exactly 5, each carrying
    that zipcode.
    """
    hits = netsyms.lookup_by_zipcode("83328", limit=5)
    hit_count = len(hits)
    assert hit_count == 5, f"Expected 5 results, got {hit_count}"

    for entry in hits:
        found = entry['zipcode']
        assert found == '83328', f"Zipcode mismatch: {found}"
    print(" PASS: lookup_by_zipcode")
|
||||||
|
|
||||||
|
|
||||||
|
def test_health():
    """The health report is OK, large enough, and lists US and CA.

    Expects ``ok`` to be True, at least 159,000,000 rows, and both 'US' and
    'CA' among the indexed countries.
    """
    report = netsyms.health()
    assert report['ok'] is True, f"Health not OK: {report}"

    rows = report['row_count']
    assert rows >= 159_000_000, f"Row count too low: {rows}"

    countries = report['indexed_countries']
    for code in ('US', 'CA'):
        assert code in countries, f"{code} not in countries: {countries}"
    print(" PASS: health")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: run every test in order; the first failing
    # assertion aborts the run before the final message is printed.
    print("Running Netsyms tests...")
    for _case in (
        test_lookup_by_street_lowercase,
        test_lookup_by_street_uppercase,
        test_lookup_nonexistent,
        test_free_text_with_commas,
        test_free_text_no_commas,
        test_lookup_by_zipcode,
        test_health,
    ):
        _case()
    print("All tests passed.")
|
||||||
|
|
@ -77,10 +77,73 @@ def _text_hash(text):
|
||||||
return hashlib.md5(text.encode('utf-8')).hexdigest()
|
return hashlib.md5(text.encode('utf-8')).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _flatten_table(table_el):
    """Render a <table> element as pipe-delimited text, one line per row.

    Every <tr> yielded by ``table_el.iter('tr')`` is visited. For each of its
    direct <td>/<th> children the text content is stripped and runs of
    whitespace are collapsed to a single space; cells that end up empty are
    left out. The remaining cells of a row are joined with ' | ', and rows
    with no remaining cells are skipped.

    Returns:
        The rows joined by newlines, or '' when no row produced any text.
        No surrounding blank lines are added here.
    """
    lines = []
    for row in table_el.iter('tr'):
        normalized = (
            re.sub(r'\s+', ' ', (cell.text_content() or '').strip())
            for cell in row
            if cell.tag in ('td', 'th')
        )
        kept = [text for text in normalized if text]
        if kept:
            lines.append(' | '.join(kept))
    # Joining an empty list already gives '', the "no rows" result.
    return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _preprocess_tree(doc):
    """Pre-process HTML tree to add delimiters before text_content() flattens it.

    Handles: <table>, <br>, <li>, <dt>, <dd> -- elements that lxml's
    text_content() would concatenate without any separators.

    The tree is modified in place; nothing is returned.

    Args:
        doc: Root element of the parsed HTML tree.
    """
    from lxml import etree

    # 1. Replace <table> elements with their pipe-delimited text
    for table in list(doc.iter('table')):
        parent = table.getparent()
        if parent is None:
            # A parentless (root) table can be neither replaced nor dropped;
            # leave it for text_content() to flatten as-is.
            continue
        formatted = _flatten_table(table)
        if formatted:
            replacement = etree.Element('div')
            replacement.text = '\n\n' + formatted + '\n\n'
            # In lxml the text following an element is stored as that
            # element's tail and leaves the tree with it; carry it over so
            # text right after the table is not lost.
            replacement.tail = table.tail
            parent.replace(table, replacement)
        else:
            # Nothing to render: remove the table (drop_tree keeps its tail).
            table.drop_tree()

    # 2. <br> -> newline at the start of the text that follows it
    for br in list(doc.iter('br')):
        br.tail = '\n' + (br.tail or '')

    # 3. <li> -> "- " prefix on the item, newline after it
    for li in list(doc.iter('li')):
        li.text = '- ' + (li.text or '')
        li.tail = '\n' + (li.tail or '')

    # 4. <dt> -> newline after the term
    for dt in list(doc.iter('dt')):
        dt.tail = '\n' + (dt.tail or '')

    # 5. <dd> -> leading space on the definition, newline after it
    for dd in list(doc.iter('dd')):
        dd.text = ' ' + (dd.text or '')
        dd.tail = '\n' + (dd.tail or '')
|
||||||
|
|
||||||
|
|
||||||
def _html_to_text(html_bytes):
|
def _html_to_text(html_bytes):
|
||||||
"""Convert HTML bytes to clean text via lxml.
|
"""Convert HTML bytes to clean text via lxml.
|
||||||
|
|
||||||
Strips nav, footer, script, style elements. Decodes entities.
|
Strips nav, footer, script, style elements. Decodes entities.
|
||||||
|
Pre-processes tables, lists, and line breaks for proper delimiters.
|
||||||
Normalizes whitespace.
|
Normalizes whitespace.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
|
@ -93,6 +156,9 @@ def _html_to_text(html_bytes):
|
||||||
for el in doc.iter(tag):
|
for el in doc.iter(tag):
|
||||||
el.drop_tree()
|
el.drop_tree()
|
||||||
|
|
||||||
|
# Pre-process tree: tables -> pipe-delimited, br -> newlines, li -> dashes
|
||||||
|
_preprocess_tree(doc)
|
||||||
|
|
||||||
# Extract text
|
# Extract text
|
||||||
text = doc.text_content()
|
text = doc.text_content()
|
||||||
|
|
||||||
|
|
|
||||||
161
lib/query_router.py
Normal file
161
lib/query_router.py
Normal file
|
|
@ -0,0 +1,161 @@
|
||||||
|
"""Semantic query router for Aurora.
|
||||||
|
|
||||||
|
Classifies user queries into routes (nav_route, nav_reverse_geocode,
|
||||||
|
direct_answer, rag_search) by comparing query embeddings against
|
||||||
|
pre-computed route centroids from example queries.
|
||||||
|
|
||||||
|
TEI endpoint: http://100.64.0.14:8090/embed (cortex via Tailscale)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import math
|
||||||
|
import threading
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# ── Route examples ────────────────────────────────────────────────────────────
|
||||||
|
# Maps each route name to example queries for that route. _ensure_centroids()
# embeds all of these and averages each route's vectors into one centroid.
ROUTE_EXAMPLES = {
    # Requests for directions / travel between places.
    "nav_route": [
        "how do I get to Boise",
        "directions to Twin Falls",
        "how do I get from Buhl to Boise",
        "drive from Jerome to Sun Valley",
        "route from Boise to McCall",
        "what's the fastest way to Sun Valley",
        "how far is it to Twin Falls",
        "take me to Shoshone",
        "navigate to the airport",
        "how do I drive to Salt Lake City",
        "walking directions to the park",
        "bike route to downtown",
    ],
    # Questions about what is at a coordinate or at the current position.
    "nav_reverse_geocode": [
        "what town is at 42.5, -114.7",
        "where am I right now",
        "what is at coordinates 43.6, -116.2",
        "what location is 42.574, -114.607",
        "where is this place 44.0, -114.3",
        "what city is near 42.7, -114.5",
        "reverse geocode 43.0, -115.0",
        "what's at this location 42.9, -114.8",
    ],
    # Greetings and small talk.
    "direct_answer": [
        "hello",
        "hey aurora",
        "good morning",
        "thanks",
        "thank you",
        "what's your name",
        "who are you",
        "tell me a joke",
        "how are you",
        "hi there",
    ],
    # Questions about manual / handbook content.
    "rag_search": [
        "what does the survival manual say about water",
        "how to purify water in the field",
        "how to treat a gunshot wound",
        "what is the ranger handbook chapter on patrolling",
        "field manual water purification",
        "how to build a shelter in the wilderness",
        "tactical combat casualty care procedures",
        "what does FM 21-76 say about fire starting",
    ],
}
|
||||||
|
|
||||||
|
# ── Module-level cache ────────────────────────────────────────────────────────
|
||||||
|
# Route name -> centroid vector; stays None until _ensure_centroids() fills it.
_ROUTE_CENTROIDS: dict | None = None
# Held by _ensure_centroids() while it builds and stores the centroids.
_LOCK = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def _embed_batch(
    texts: list[str],
    tei_url: str,
    batch_size: int = 32,
) -> list[list[float]]:
    """Embed a batch of texts via TEI.

    The texts are sent in consecutive chunks of at most ``batch_size`` so a
    long list does not exceed the server's per-request input limit (TEI's
    ``--max-client-batch-size`` defaults to 32). Results are concatenated in
    input order.

    Args:
        texts: Strings to embed.
        tei_url: URL of the TEI embed endpoint.
        batch_size: Maximum number of inputs per HTTP request.

    Returns:
        The decoded JSON responses concatenated into one list; an empty list
        (and no request) when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        requests.HTTPError: If the endpoint answers with an error status.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    vectors: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        resp = requests.post(tei_url, json={"inputs": chunk}, timeout=30)
        resp.raise_for_status()
        vectors.extend(resp.json())
    return vectors
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_centroid(vectors: list[list[float]]) -> list[float]:
|
||||||
|
"""Element-wise mean of vectors."""
|
||||||
|
n = len(vectors)
|
||||||
|
dim = len(vectors[0])
|
||||||
|
centroid = [0.0] * dim
|
||||||
|
for vec in vectors:
|
||||||
|
for i in range(dim):
|
||||||
|
centroid[i] += vec[i]
|
||||||
|
for i in range(dim):
|
||||||
|
centroid[i] /= n
|
||||||
|
return centroid
|
||||||
|
|
||||||
|
|
||||||
|
def _cosine_similarity(a: list[float], b: list[float]) -> float:
|
||||||
|
"""Cosine similarity between two vectors (pure Python)."""
|
||||||
|
dot = 0.0
|
||||||
|
norm_a = 0.0
|
||||||
|
norm_b = 0.0
|
||||||
|
for i in range(len(a)):
|
||||||
|
dot += a[i] * b[i]
|
||||||
|
norm_a += a[i] * a[i]
|
||||||
|
norm_b += b[i] * b[i]
|
||||||
|
denom = math.sqrt(norm_a) * math.sqrt(norm_b)
|
||||||
|
if denom == 0:
|
||||||
|
return 0.0
|
||||||
|
return dot / denom
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_centroids(tei_url: str) -> dict[str, list[float]]:
    """Lazy-init: embed all examples, compute centroids, cache.

    On the first successful call every example in ``ROUTE_EXAMPLES`` is
    embedded through ``_embed_batch`` and the vectors of each route are
    averaged with ``_compute_centroid``. The result is stored in
    ``_ROUTE_CENTROIDS`` and returned as-is on later calls. If embedding
    fails, nothing is cached and the next call tries again.

    Args:
        tei_url: URL of the TEI embed endpoint.

    Returns:
        Mapping of route name to centroid vector.

    Raises:
        ValueError: If the embedding service does not return exactly one
            vector per example.
    """
    global _ROUTE_CENTROIDS
    if _ROUTE_CENTROIDS is not None:
        return _ROUTE_CENTROIDS

    with _LOCK:
        # Re-check under the lock: another thread may have finished the
        # initialization while this one was waiting.
        if _ROUTE_CENTROIDS is not None:
            return _ROUTE_CENTROIDS

        # Flatten all examples into one list, in route order.
        all_texts = [
            example
            for examples in ROUTE_EXAMPLES.values()
            for example in examples
        ]

        all_vectors = _embed_batch(all_texts, tei_url)
        if len(all_vectors) != len(all_texts):
            # Slicing a short response would silently build centroids from
            # the wrong (or no) vectors.
            raise ValueError(
                f"expected {len(all_texts)} embeddings, "
                f"got {len(all_vectors)}"
            )

        centroids = {}
        offset = 0
        for route, examples in ROUTE_EXAMPLES.items():
            end = offset + len(examples)
            centroids[route] = _compute_centroid(all_vectors[offset:end])
            offset = end

        _ROUTE_CENTROIDS = centroids
        return _ROUTE_CENTROIDS
|
||||||
|
|
||||||
|
|
||||||
|
def classify(
    query: str,
    tei_url: str = "http://100.64.0.14:8090/embed",
    threshold: float = 0.45,
) -> tuple[str, float]:
    """Pick the route whose centroid is most similar to the query embedding.

    Args:
        query: User query text.
        tei_url: URL of the TEI embed endpoint.
        threshold: Minimum similarity required to accept the best route.

    Returns:
        ``(route_name, confidence)``. When the best similarity is below
        ``threshold`` the route is "rag_search" and the confidence is still
        the best similarity found. The running best starts at 0.0, so a
        route only wins with a strictly positive similarity.
    """
    centroids = _ensure_centroids(tei_url)

    # Embed just this query and take its single vector.
    query_vec = _embed_batch([query], tei_url)[0]

    winner, top_score = "rag_search", 0.0
    for name, center in centroids.items():
        score = _cosine_similarity(query_vec, center)
        if score > top_score:
            winner, top_score = name, score

    chosen = "rag_search" if top_score < threshold else winner
    return (chosen, top_score)
|
||||||
49
lib/query_router_test.py
Normal file
49
lib/query_router_test.py
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Test suite for the semantic query router."""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from lib.query_router import classify
|
||||||
|
|
||||||
|
# (query, expected route) pairs; main() classifies each query and compares
# the returned route with the expected one.
TEST_QUERIES = [
    ("how do I get from Buhl to Boise", "nav_route"),
    ("what does the survival manual say about water", "rag_search"),
    ("what town is at 42.5, -114.7", "nav_reverse_geocode"),
    ("hey aurora", "direct_answer"),
    ("what's the fastest way to Sun Valley", "nav_route"),
    ("how to purify water in the field", "rag_search"),
    ("good morning", "direct_answer"),
]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Classify every entry of TEST_QUERIES and print a pass/fail report.

    Prints one status line and one detail line per query, then a summary.
    Exits the process with status 1 when at least one query was routed to
    something other than its expected route.
    """
    rule = "=" * 70
    print("Query Router Test Suite")
    print(rule)

    tally = {"PASS": 0, "FAIL": 0}
    for query, expected in TEST_QUERIES:
        route, confidence = classify(query)
        status = "PASS" if route == expected else "FAIL"
        tally[status] += 1
        print(f" [{status}] {query!r}")
        print(f" → {route} ({confidence:.3f}) expected={expected}")

    passed, failed = tally["PASS"], tally["FAIL"]
    print(rule)
    print(f"Results: {passed}/{passed + failed} passed")

    if not failed:
        print(" All tests passed!")
        return
    print(f" {failed} FAILED")
    sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -3,6 +3,7 @@ anyio==4.12.1
|
||||||
babel==2.18.0
|
babel==2.18.0
|
||||||
beautifulsoup4==4.14.3
|
beautifulsoup4==4.14.3
|
||||||
blinker==1.9.0
|
blinker==1.9.0
|
||||||
|
cachetools==7.1.3
|
||||||
certifi==2026.1.4
|
certifi==2026.1.4
|
||||||
cffi==2.0.0
|
cffi==2.0.0
|
||||||
charset-normalizer==3.4.4
|
charset-normalizer==3.4.4
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue