mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
feat(navi): structured geocode with usaddress parsing and reranker
Add lib/geocode.py — multi-source retrieval pipeline: - usaddress CRF parsing with intent classification - Netsyms structured lookup (uses raw street abbreviations) - Photon /structured + /api freetext retrieval - Weighted 10-signal reranker (housenumber, street fuzz, locality, source authority, etc.) - match_code annotations + address book proximity labeling - Trace log at /tmp/geocode_rerank_trace.log nav_tools.py now delegates geocode() to the new module. Tests updated: US address queries correctly return Netsyms results. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
c76d63b785
commit
87b230dcba
3 changed files with 721 additions and 197 deletions
708
lib/geocode.py
Normal file
708
lib/geocode.py
Normal file
|
|
@ -0,0 +1,708 @@
|
|||
"""
|
||||
RECON geocode — structured preprocessing, multi-source retrieval, reranking.
|
||||
|
||||
Replaces the naive Photon-only search with:
|
||||
1. usaddress parsing + intent classification (ADDRESS / POI / LOCALITY / COORD / POSTCODE)
|
||||
2. Multi-source retrieval: ADDRESS → Netsyms + Photon; POI/LOCALITY → Photon /api
|
||||
3. Python reranker with weighted signals
|
||||
|
||||
Public entry point: geocode(query, limit) → {query, results, count}
|
||||
"""
|
||||
|
||||
import math
|
||||
import re
|
||||
import logging
|
||||
|
||||
import requests
|
||||
import usaddress
|
||||
from rapidfuzz import fuzz
|
||||
|
||||
from .utils import setup_logging
|
||||
|
||||
logger = setup_logging('recon.geocode')
|
||||
|
||||
# ── Trace logger for reranking audit ──
|
||||
_trace_logger = logging.getLogger('recon.geocode.trace')
|
||||
_trace_handler = logging.FileHandler('/tmp/geocode_rerank_trace.log')
|
||||
_trace_handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
|
||||
_trace_logger.addHandler(_trace_handler)
|
||||
_trace_logger.setLevel(logging.DEBUG)
|
||||
|
||||
# ── Config constants ──
|
||||
PHOTON_URL = "http://localhost:2322"
|
||||
GEOCODE_BIAS_LAT = 42.5736
|
||||
GEOCODE_BIAS_LON = -114.6066
|
||||
GEOCODE_BIAS_ZOOM = 10
|
||||
ADDRESS_BOOK_ANNOTATION_RADIUS_M = 75
|
||||
|
||||
# ── Reranker weights ──
|
||||
# Derived from research analysis of failure modes:
|
||||
# housenumber_exact is the strongest signal because Photon's soft-boost
|
||||
# lets wrong-number results bubble up. street_name_fuzz and locality_fuzz
|
||||
# handle abbreviation/case variation. source_authority gives Netsyms a
|
||||
# boost for US addresses since it has USPS-verified data.
|
||||
W_HOUSENUMBER_EXACT = 6.0 # exact housenumber match
|
||||
W_HOUSENUMBER_MISMATCH = -5.0 # housenumber present but wrong
|
||||
W_STREET_NAME_FUZZ = 3.0 # fuzzy street name similarity [0..1] * weight
|
||||
W_TOKEN_COVERAGE = 2.0 # fraction of query tokens found in result
|
||||
W_STREET_TYPE_MATCH = 1.5 # "st" matches "street", etc.
|
||||
W_LOCALITY_FUZZ = 2.0 # city/state fuzzy match
|
||||
W_SOURCE_AUTHORITY = 2.0 # Netsyms for US addresses
|
||||
W_LAYER_RANK = 1.0 # type-appropriate results ranked higher
|
||||
W_PHOTON_POSITION_NORM = 1.0 # Photon's native ranking (normalized by position)
|
||||
W_STATE_EXACT = 1.0 # exact state code match
|
||||
|
||||
# ── US abbreviation expansions ──
|
||||
# Applied ONLY to parsed StreetName/StreetNamePostType tokens, NOT to ordinals.
|
||||
_STREET_TYPE_ABBREVS = {
|
||||
'st': 'street', 'ave': 'avenue', 'blvd': 'boulevard', 'dr': 'drive',
|
||||
'rd': 'road', 'ln': 'lane', 'ct': 'court', 'cir': 'circle',
|
||||
'pl': 'place', 'way': 'way', 'pkwy': 'parkway', 'hwy': 'highway',
|
||||
'trl': 'trail', 'ter': 'terrace', 'sq': 'square',
|
||||
}
|
||||
_DIRECTIONAL_ABBREVS = {
|
||||
'n': 'north', 's': 'south', 'e': 'east', 'w': 'west',
|
||||
'ne': 'northeast', 'nw': 'northwest', 'se': 'southeast', 'sw': 'southwest',
|
||||
}
|
||||
_ORDINAL_RE = re.compile(r'^\d+(st|nd|rd|th)$', re.IGNORECASE)
|
||||
|
||||
# ── US state codes ──
|
||||
_STATE_CODES = {
|
||||
'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
|
||||
'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
|
||||
'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
|
||||
'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
|
||||
'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'DC',
|
||||
}
|
||||
|
||||
# Coordinate regex
|
||||
_COORD_RE = re.compile(r'^\s*(-?\d+\.?\d*)\s*[,\s]\s*(-?\d+\.?\d*)\s*$')
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# STEP 1: PREPROCESSING
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
def _parse_coords(text):
|
||||
"""Return (lat, lon) if text looks like coordinates with valid bounds, else None."""
|
||||
m = _COORD_RE.match(text.strip())
|
||||
if not m:
|
||||
return None
|
||||
lat, lon = float(m.group(1)), float(m.group(2))
|
||||
if -90 <= lat <= 90 and -180 <= lon <= 180:
|
||||
return lat, lon
|
||||
return None
|
||||
|
||||
|
||||
def _classify_and_parse(query):
|
||||
"""
|
||||
Parse query with usaddress, classify intent, expand abbreviations.
|
||||
|
||||
Returns (intent, parsed_dict) where:
|
||||
intent: 'ADDRESS' | 'POI' | 'LOCALITY' | 'POSTCODE' | 'COORD' | 'UNKNOWN'
|
||||
parsed_dict: {number, street, city, state, zipcode, raw_query, expanded_query}
|
||||
"""
|
||||
q = query.strip()
|
||||
parsed = {
|
||||
'number': None, 'street': None, 'street_raw': None,
|
||||
'city': None, 'state': None,
|
||||
'zipcode': None, 'raw_query': q, 'expanded_query': q,
|
||||
}
|
||||
|
||||
# Coordinate check first
|
||||
if _parse_coords(q):
|
||||
return 'COORD', parsed
|
||||
|
||||
# Try usaddress
|
||||
try:
|
||||
tagged, addr_type = usaddress.tag(q)
|
||||
except usaddress.RepeatedLabelError:
|
||||
# Ambiguous input — fall back to free-text Photon
|
||||
return 'UNKNOWN', parsed
|
||||
|
||||
# Extract components
|
||||
number = tagged.get('AddressNumber', '').strip()
|
||||
street_name = tagged.get('StreetName', '').strip()
|
||||
street_pre_dir = tagged.get('StreetNamePreDirectional', '').strip()
|
||||
street_post_type = tagged.get('StreetNamePostType', '').strip()
|
||||
place = tagged.get('PlaceName', '').strip()
|
||||
state = tagged.get('StateName', '').strip()
|
||||
zipcode = tagged.get('ZipCode', '').strip()
|
||||
|
||||
# ── Fix usaddress edge case: "214 N St Filer" ──
|
||||
# usaddress reads single-letter directional + "St" as PreDirectional + empty,
|
||||
# mashing "St Filer" into StreetName. Detect: PreDirectional is single letter,
|
||||
# StreetName has 2+ tokens where the first is a street type.
|
||||
if (street_pre_dir and len(street_pre_dir) <= 2
|
||||
and not street_name.strip().startswith(street_pre_dir)
|
||||
and ' ' in street_name):
|
||||
name_tokens = street_name.split()
|
||||
first_lower = name_tokens[0].lower()
|
||||
if first_lower in _STREET_TYPE_ABBREVS or first_lower in _STREET_TYPE_ABBREVS.values():
|
||||
# "N" is actually the street name, "St" is the post-type
|
||||
street_name = street_pre_dir
|
||||
street_post_type = name_tokens[0]
|
||||
if len(name_tokens) > 1:
|
||||
place = ' '.join(name_tokens[1:])
|
||||
street_pre_dir = ''
|
||||
|
||||
# ── Expand abbreviations (guard ordinals) ──
|
||||
expanded_parts = []
|
||||
|
||||
if number:
|
||||
parsed['number'] = number
|
||||
expanded_parts.append(number)
|
||||
|
||||
if street_pre_dir:
|
||||
exp = _DIRECTIONAL_ABBREVS.get(street_pre_dir.lower(), street_pre_dir)
|
||||
expanded_parts.append(exp)
|
||||
|
||||
if street_name:
|
||||
# Don't expand ordinals: "21st" stays "21st"
|
||||
if _ORDINAL_RE.match(street_name):
|
||||
expanded_parts.append(street_name)
|
||||
else:
|
||||
# Expand directional abbreviation if it IS the street name
|
||||
exp = _DIRECTIONAL_ABBREVS.get(street_name.lower(), street_name)
|
||||
expanded_parts.append(exp)
|
||||
parsed['street'] = street_name
|
||||
|
||||
if street_post_type:
|
||||
if _ORDINAL_RE.match(street_post_type):
|
||||
expanded_parts.append(street_post_type)
|
||||
else:
|
||||
exp = _STREET_TYPE_ABBREVS.get(street_post_type.lower(), street_post_type)
|
||||
expanded_parts.append(exp)
|
||||
|
||||
# Build raw street (original abbreviations, for Netsyms) and expanded (for Photon)
|
||||
raw_street_parts = []
|
||||
if street_pre_dir:
|
||||
raw_street_parts.append(street_pre_dir)
|
||||
if street_name:
|
||||
raw_street_parts.append(street_name)
|
||||
if street_post_type:
|
||||
raw_street_parts.append(street_post_type)
|
||||
parsed['street_raw'] = ' '.join(raw_street_parts)
|
||||
|
||||
# Build the full expanded street
|
||||
if expanded_parts:
|
||||
# The street is everything after the number
|
||||
street_full = ' '.join(expanded_parts[1:] if number else expanded_parts)
|
||||
parsed['street'] = street_full
|
||||
|
||||
if place:
|
||||
parsed['city'] = place
|
||||
expanded_parts.append(place)
|
||||
if state:
|
||||
parsed['state'] = state.upper()
|
||||
expanded_parts.append(state)
|
||||
if zipcode:
|
||||
parsed['zipcode'] = zipcode
|
||||
expanded_parts.append(zipcode)
|
||||
|
||||
parsed['expanded_query'] = ' '.join(expanded_parts)
|
||||
|
||||
# ── Intent classification ──
|
||||
if addr_type == 'Street Address' and number:
|
||||
return 'ADDRESS', parsed
|
||||
elif zipcode and not number and not street_name:
|
||||
return 'POSTCODE', parsed
|
||||
elif addr_type == 'Ambiguous':
|
||||
# Check if it looks like a locality: 2 tokens, second is a state code
|
||||
tokens = q.replace(',', ' ').split()
|
||||
if len(tokens) >= 2 and tokens[-1].upper() in _STATE_CODES:
|
||||
parsed['city'] = ' '.join(tokens[:-1])
|
||||
parsed['state'] = tokens[-1].upper()
|
||||
return 'LOCALITY', parsed
|
||||
return 'UNKNOWN', parsed
|
||||
else:
|
||||
return 'UNKNOWN', parsed
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# STEP 2: RETRIEVAL
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
def _retrieve_netsyms(parsed, limit=10):
|
||||
"""Query Netsyms for structured address lookup. Returns list of candidate dicts."""
|
||||
try:
|
||||
from . import netsyms
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
results = []
|
||||
number = parsed.get('number', '')
|
||||
street = parsed.get('street_raw') or parsed.get('street', '')
|
||||
city = parsed.get('city', '')
|
||||
state = parsed.get('state', '')
|
||||
zipcode = parsed.get('zipcode', '')
|
||||
|
||||
if number and street:
|
||||
rows = netsyms.lookup_by_street(
|
||||
number, street, city=city, state=state, zipcode=zipcode, limit=limit
|
||||
)
|
||||
elif zipcode:
|
||||
rows = netsyms.lookup_by_zipcode(zipcode, limit=limit)
|
||||
else:
|
||||
return []
|
||||
|
||||
for row in rows:
|
||||
addr_parts = [row['number'], row['street']]
|
||||
if row.get('street2'):
|
||||
addr_parts.append(row['street2'])
|
||||
addr_parts.extend([row['city'], row['state'], row['zipcode']])
|
||||
display = ' '.join(p for p in addr_parts if p)
|
||||
results.append({
|
||||
'name': display,
|
||||
'lat': row['lat'],
|
||||
'lon': row['lon'],
|
||||
'source': 'netsyms',
|
||||
'type': 'street_address',
|
||||
'raw': row,
|
||||
'_number': row.get('number', ''),
|
||||
'_street': row.get('street', ''),
|
||||
'_city': row.get('city', ''),
|
||||
'_state': row.get('state', ''),
|
||||
})
|
||||
return results
|
||||
|
||||
|
||||
def _retrieve_photon_structured(parsed, limit=10):
|
||||
"""Query Photon /structured endpoint for address lookup."""
|
||||
params = {'limit': limit, 'countrycode': 'US'}
|
||||
if parsed.get('street'):
|
||||
params['street'] = parsed['street']
|
||||
if parsed.get('number'):
|
||||
params['housenumber'] = parsed['number']
|
||||
if parsed.get('city'):
|
||||
params['city'] = parsed['city']
|
||||
if parsed.get('state'):
|
||||
params['state'] = parsed['state']
|
||||
|
||||
if 'street' not in params:
|
||||
return []
|
||||
|
||||
try:
|
||||
resp = requests.get(f"{PHOTON_URL}/structured", params=params, timeout=5)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
except Exception as e:
|
||||
logger.debug("Photon /structured failed: %s", e)
|
||||
return []
|
||||
|
||||
return _parse_photon_features(data.get('features', []), 'photon')
|
||||
|
||||
|
||||
def _retrieve_photon_freetext(query, limit=10):
|
||||
"""Query Photon /api for free-text search with location bias."""
|
||||
try:
|
||||
params = {
|
||||
'q': query,
|
||||
'limit': limit,
|
||||
'lat': GEOCODE_BIAS_LAT,
|
||||
'lon': GEOCODE_BIAS_LON,
|
||||
'zoom': GEOCODE_BIAS_ZOOM,
|
||||
}
|
||||
resp = requests.get(f"{PHOTON_URL}/api", params=params, timeout=5)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
except Exception as e:
|
||||
logger.debug("Photon /api failed: %s", e)
|
||||
return []
|
||||
|
||||
return _parse_photon_features(data.get('features', []), 'photon')
|
||||
|
||||
|
||||
def _parse_photon_features(features, source):
|
||||
"""Convert Photon GeoJSON features to candidate dicts."""
|
||||
results = []
|
||||
for i, feature in enumerate(features):
|
||||
props = feature.get('properties', {})
|
||||
coords = feature.get('geometry', {}).get('coordinates', [0, 0])
|
||||
|
||||
osm_key = props.get('osm_key', '')
|
||||
osm_value = props.get('osm_value', '')
|
||||
feat_type = props.get('type', '')
|
||||
has_hn = bool(props.get('housenumber'))
|
||||
|
||||
if has_hn or osm_value in ('house', 'residential'):
|
||||
rtype = 'street_address'
|
||||
elif feat_type in ('city', 'town', 'village', 'hamlet', 'county', 'state', 'country'):
|
||||
rtype = 'locality'
|
||||
elif osm_key in ('amenity', 'shop', 'tourism', 'leisure'):
|
||||
rtype = 'poi'
|
||||
else:
|
||||
rtype = 'poi'
|
||||
|
||||
# Build display name
|
||||
parts = []
|
||||
hn = props.get('housenumber')
|
||||
street = props.get('street')
|
||||
name = props.get('name', '')
|
||||
if hn and street:
|
||||
parts.append(f"{hn} {street}")
|
||||
if name and name != street:
|
||||
parts.append(name)
|
||||
elif name:
|
||||
parts.append(name)
|
||||
elif street:
|
||||
parts.append(street)
|
||||
for key in ('city', 'county', 'state', 'country'):
|
||||
v = props.get(key)
|
||||
if v and (not parts or v != parts[-1]):
|
||||
parts.append(v)
|
||||
display = ', '.join(p for p in parts if p) or 'Unknown'
|
||||
|
||||
results.append({
|
||||
'name': display,
|
||||
'lat': coords[1],
|
||||
'lon': coords[0],
|
||||
'source': source,
|
||||
'type': rtype,
|
||||
'raw': props,
|
||||
'_photon_rank': i,
|
||||
'_number': props.get('housenumber', ''),
|
||||
'_street': props.get('street', ''),
|
||||
'_city': props.get('city', ''),
|
||||
'_state': props.get('state', ''),
|
||||
})
|
||||
return results
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# STEP 3: RERANKER
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
def _expand_street_type(s):
|
||||
"""Expand a street type abbreviation for comparison."""
|
||||
return _STREET_TYPE_ABBREVS.get(s.lower(), s.lower())
|
||||
|
||||
|
||||
def _score_candidate(candidate, parsed, intent):
|
||||
"""
|
||||
Score a candidate against the parsed query.
|
||||
Returns (total_score, signal_breakdown_dict).
|
||||
"""
|
||||
signals = {}
|
||||
total = 0.0
|
||||
|
||||
query_number = (parsed.get('number') or '').strip().upper()
|
||||
query_street = (parsed.get('street') or '').strip().upper()
|
||||
query_city = (parsed.get('city') or '').strip().upper()
|
||||
query_state = (parsed.get('state') or '').strip().upper()
|
||||
|
||||
cand_number = (candidate.get('_number') or '').strip().upper()
|
||||
cand_street = (candidate.get('_street') or '').strip().upper()
|
||||
cand_city = (candidate.get('_city') or '').strip().upper()
|
||||
cand_state = (candidate.get('_state') or '').strip().upper()
|
||||
|
||||
# ── Housenumber ──
|
||||
if intent == 'ADDRESS' and query_number:
|
||||
if cand_number == query_number:
|
||||
signals['housenumber_exact'] = W_HOUSENUMBER_EXACT
|
||||
total += W_HOUSENUMBER_EXACT
|
||||
elif cand_number and cand_number != query_number:
|
||||
signals['housenumber_mismatch'] = W_HOUSENUMBER_MISMATCH
|
||||
total += W_HOUSENUMBER_MISMATCH
|
||||
|
||||
# ── Street name fuzz ──
|
||||
if query_street and cand_street:
|
||||
# Expand both for comparison
|
||||
q_expanded = ' '.join(_expand_street_type(t) for t in query_street.split())
|
||||
c_expanded = ' '.join(_expand_street_type(t) for t in cand_street.split())
|
||||
ratio = fuzz.token_sort_ratio(q_expanded, c_expanded) / 100.0
|
||||
score = ratio * W_STREET_NAME_FUZZ
|
||||
signals['street_name_fuzz'] = round(score, 2)
|
||||
total += score
|
||||
|
||||
# ── Street type match ──
|
||||
if query_street and cand_street:
|
||||
q_tokens = set(_expand_street_type(t) for t in query_street.split())
|
||||
c_tokens = set(_expand_street_type(t) for t in cand_street.split())
|
||||
# Check if the street type words overlap
|
||||
street_types = set(_STREET_TYPE_ABBREVS.values())
|
||||
q_types = q_tokens & street_types
|
||||
c_types = c_tokens & street_types
|
||||
if q_types and q_types & c_types:
|
||||
signals['street_type_match'] = W_STREET_TYPE_MATCH
|
||||
total += W_STREET_TYPE_MATCH
|
||||
|
||||
# ── Token coverage ──
|
||||
raw_q = parsed.get('raw_query', '').upper()
|
||||
q_tokens = set(raw_q.replace(',', ' ').split())
|
||||
if q_tokens:
|
||||
cand_text = candidate.get('name', '').upper()
|
||||
matched = sum(1 for t in q_tokens if t in cand_text)
|
||||
coverage = matched / len(q_tokens)
|
||||
score = coverage * W_TOKEN_COVERAGE
|
||||
signals['token_coverage'] = round(score, 2)
|
||||
total += score
|
||||
|
||||
# ── Locality fuzz ──
|
||||
if query_city and cand_city:
|
||||
ratio = fuzz.ratio(query_city, cand_city) / 100.0
|
||||
score = ratio * W_LOCALITY_FUZZ
|
||||
signals['locality_fuzz'] = round(score, 2)
|
||||
total += score
|
||||
|
||||
# ── State exact ──
|
||||
if query_state and cand_state:
|
||||
if cand_state == query_state:
|
||||
signals['state_exact'] = W_STATE_EXACT
|
||||
total += W_STATE_EXACT
|
||||
|
||||
# ── Source authority ──
|
||||
if candidate.get('source') == 'netsyms' and intent == 'ADDRESS':
|
||||
signals['source_authority'] = W_SOURCE_AUTHORITY
|
||||
total += W_SOURCE_AUTHORITY
|
||||
|
||||
# ── Layer rank (type-appropriate bonus) ──
|
||||
cand_type = candidate.get('type', '')
|
||||
if intent == 'ADDRESS' and cand_type == 'street_address':
|
||||
signals['layer_rank'] = W_LAYER_RANK
|
||||
total += W_LAYER_RANK
|
||||
elif intent == 'LOCALITY' and cand_type == 'locality':
|
||||
signals['layer_rank'] = W_LAYER_RANK
|
||||
total += W_LAYER_RANK
|
||||
elif intent == 'POI' and cand_type == 'poi':
|
||||
signals['layer_rank'] = W_LAYER_RANK
|
||||
total += W_LAYER_RANK
|
||||
|
||||
# ── Photon position normalization ──
|
||||
photon_rank = candidate.get('_photon_rank')
|
||||
if photon_rank is not None:
|
||||
# Top result gets full bonus, decays linearly
|
||||
score = max(0, (1.0 - photon_rank / 10.0)) * W_PHOTON_POSITION_NORM
|
||||
signals['photon_position'] = round(score, 2)
|
||||
total += score
|
||||
|
||||
return round(total, 2), signals
|
||||
|
||||
|
||||
def _build_match_code(candidate, parsed, intent):
|
||||
"""Build a match_code dict indicating match quality for each field."""
|
||||
mc = {}
|
||||
if intent == 'ADDRESS':
|
||||
q_num = (parsed.get('number') or '').strip().upper()
|
||||
c_num = (candidate.get('_number') or '').strip().upper()
|
||||
if q_num and c_num == q_num:
|
||||
mc['housenumber'] = 'matched'
|
||||
elif q_num and c_num:
|
||||
mc['housenumber'] = 'unmatched'
|
||||
elif q_num and not c_num:
|
||||
mc['housenumber'] = 'inferred'
|
||||
|
||||
q_street = (parsed.get('street') or '').strip().upper()
|
||||
c_street = (candidate.get('_street') or '').strip().upper()
|
||||
if q_street and c_street:
|
||||
q_exp = ' '.join(_expand_street_type(t) for t in q_street.split())
|
||||
c_exp = ' '.join(_expand_street_type(t) for t in c_street.split())
|
||||
ratio = fuzz.token_sort_ratio(q_exp, c_exp) / 100.0
|
||||
mc['street'] = 'matched' if ratio > 0.8 else 'unmatched'
|
||||
elif q_street:
|
||||
mc['street'] = 'inferred'
|
||||
|
||||
q_city = (parsed.get('city') or '').strip().upper()
|
||||
c_city = (candidate.get('_city') or '').strip().upper()
|
||||
if q_city and c_city:
|
||||
ratio = fuzz.ratio(q_city, c_city) / 100.0
|
||||
mc['city'] = 'matched' if ratio > 0.8 else 'unmatched'
|
||||
elif q_city:
|
||||
mc['city'] = 'inferred'
|
||||
|
||||
return mc
|
||||
|
||||
|
||||
def _rerank(candidates, parsed, intent, query, limit):
|
||||
"""Score, sort, and trim candidates. Trace-log top 3."""
|
||||
scored = []
|
||||
for c in candidates:
|
||||
total, signals = _score_candidate(c, parsed, intent)
|
||||
c['_score'] = total
|
||||
c['_signals'] = signals
|
||||
scored.append(c)
|
||||
|
||||
scored.sort(key=lambda c: c['_score'], reverse=True)
|
||||
|
||||
# Trace log for audit
|
||||
_trace_logger.debug("─── Query: %r intent=%s ───", query, intent)
|
||||
for i, c in enumerate(scored[:3]):
|
||||
_trace_logger.debug(
|
||||
" #%d score=%.2f src=%s name=%s",
|
||||
i, c['_score'], c.get('source', '?'), c.get('name', '?')[:60]
|
||||
)
|
||||
_trace_logger.debug(" signals=%s", c.get('_signals', {}))
|
||||
|
||||
# Clean internal fields and add match_code
|
||||
result = []
|
||||
for c in scored[:limit]:
|
||||
mc = _build_match_code(c, parsed, intent)
|
||||
|
||||
# Assign confidence from score
|
||||
score = c.get('_score', 0)
|
||||
if score >= 10:
|
||||
confidence = 'exact'
|
||||
elif score >= 5:
|
||||
confidence = 'high'
|
||||
elif score >= 2:
|
||||
confidence = 'medium'
|
||||
else:
|
||||
confidence = 'low'
|
||||
|
||||
entry = {
|
||||
'name': c['name'],
|
||||
'lat': c['lat'],
|
||||
'lon': c['lon'],
|
||||
'source': c['source'],
|
||||
'confidence': confidence,
|
||||
'type': c.get('type', 'poi'),
|
||||
'raw': c.get('raw'),
|
||||
}
|
||||
if mc:
|
||||
entry['match_code'] = mc
|
||||
result.append(entry)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# STEP 4: ANNOTATION
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
def _haversine_m(lat1, lon1, lat2, lon2):
|
||||
"""Haversine distance in meters."""
|
||||
R = 6_371_000
|
||||
rlat1, rlat2 = math.radians(lat1), math.radians(lat2)
|
||||
dlat = math.radians(lat2 - lat1)
|
||||
dlon = math.radians(lon2 - lon1)
|
||||
a = math.sin(dlat / 2) ** 2 + math.cos(rlat1) * math.cos(rlat2) * math.sin(dlon / 2) ** 2
|
||||
return R * 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
|
||||
|
||||
|
||||
def _annotate_with_address_book(results):
|
||||
"""Add labeled_as to results within radius of an address book entry."""
|
||||
try:
|
||||
from . import address_book
|
||||
entries = address_book.load()
|
||||
except Exception:
|
||||
return
|
||||
for result in results:
|
||||
rlat, rlon = result.get('lat'), result.get('lon')
|
||||
if rlat is None or rlon is None:
|
||||
continue
|
||||
for entry in entries:
|
||||
elat, elon = entry.get('lat'), entry.get('lon')
|
||||
if elat is None or elon is None:
|
||||
continue
|
||||
if _haversine_m(rlat, rlon, elat, elon) <= ADDRESS_BOOK_ANNOTATION_RADIUS_M:
|
||||
result['labeled_as'] = entry['name']
|
||||
break
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# PUBLIC API
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
def geocode(query, limit=10):
|
||||
"""
|
||||
Structured geocoding with multi-source retrieval and reranking.
|
||||
|
||||
Returns {query, results: [...], count} — always 200-safe.
|
||||
"""
|
||||
limit = max(1, min(limit, 20))
|
||||
q = (query or '').strip()
|
||||
empty = {'query': q, 'results': [], 'count': 0}
|
||||
|
||||
if not q:
|
||||
return empty
|
||||
|
||||
# ── Coordinate detection ──
|
||||
coords = _parse_coords(q)
|
||||
if coords:
|
||||
return {
|
||||
'query': q,
|
||||
'results': [{
|
||||
'name': q,
|
||||
'lat': coords[0],
|
||||
'lon': coords[1],
|
||||
'source': 'coordinates',
|
||||
'confidence': 'exact',
|
||||
'type': 'coordinates',
|
||||
'raw': None,
|
||||
}],
|
||||
'count': 1,
|
||||
}
|
||||
|
||||
# ── Address book nickname short-circuit ──
|
||||
normalized_q = ' '.join(q.lower().replace(',', ' ').split())
|
||||
is_single_word = ' ' not in normalized_q
|
||||
try:
|
||||
from . import address_book
|
||||
ab_match = address_book.lookup(q)
|
||||
if (ab_match
|
||||
and ab_match['confidence'] == 'exact'
|
||||
and ab_match.get('lat') and ab_match.get('lon')
|
||||
and is_single_word):
|
||||
logger.info("geocode: nickname short-circuit %r → %s", q, ab_match['name'])
|
||||
return {
|
||||
'query': q,
|
||||
'results': [{
|
||||
'name': ab_match.get('address') or ab_match['name'],
|
||||
'lat': ab_match['lat'],
|
||||
'lon': ab_match['lon'],
|
||||
'source': 'address_book',
|
||||
'confidence': 'exact',
|
||||
'type': 'nickname',
|
||||
'raw': ab_match,
|
||||
}],
|
||||
'count': 1,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.debug("geocode: address_book lookup failed: %s", e)
|
||||
|
||||
# ── Classify intent + parse ──
|
||||
intent, parsed = _classify_and_parse(q)
|
||||
logger.debug("geocode: intent=%s parsed=%s", intent, parsed)
|
||||
|
||||
# ── Retrieve candidates ──
|
||||
candidates = []
|
||||
|
||||
if intent == 'ADDRESS':
|
||||
# Parallel: Netsyms (structured) + Photon (freetext with expanded query)
|
||||
netsyms_results = _retrieve_netsyms(parsed, limit=limit)
|
||||
photon_results = _retrieve_photon_freetext(
|
||||
parsed.get('expanded_query', q), limit=limit
|
||||
)
|
||||
# Also try Photon /structured for addresses
|
||||
photon_struct = _retrieve_photon_structured(parsed, limit=5)
|
||||
candidates = netsyms_results + photon_results + photon_struct
|
||||
|
||||
elif intent == 'POSTCODE':
|
||||
netsyms_results = _retrieve_netsyms(parsed, limit=limit)
|
||||
photon_results = _retrieve_photon_freetext(q, limit=limit)
|
||||
candidates = netsyms_results + photon_results
|
||||
|
||||
elif intent in ('LOCALITY', 'POI', 'UNKNOWN'):
|
||||
candidates = _retrieve_photon_freetext(q, limit=limit)
|
||||
|
||||
# ── Deduplicate by (lat, lon) proximity ──
|
||||
deduped = []
|
||||
for c in candidates:
|
||||
is_dup = False
|
||||
for existing in deduped:
|
||||
if (_haversine_m(c['lat'], c['lon'], existing['lat'], existing['lon']) < 50
|
||||
and c.get('source') == existing.get('source')):
|
||||
is_dup = True
|
||||
break
|
||||
if not is_dup:
|
||||
deduped.append(c)
|
||||
candidates = deduped
|
||||
|
||||
# ── Rerank ──
|
||||
results = _rerank(candidates, parsed, intent, q, limit)
|
||||
|
||||
# ── Address book annotation ──
|
||||
_annotate_with_address_book(results)
|
||||
|
||||
logger.info("geocode: %r → intent=%s, %d results", q, intent, len(results))
|
||||
return {'query': q, 'results': results, 'count': len(results)}
|
||||
|
|
@ -20,25 +20,24 @@ TESTS = [
|
|||
),
|
||||
},
|
||||
{
|
||||
"name": "214 north st filer → photon results (multi-word, not nickname)",
|
||||
"name": "214 north st filer → netsyms exact match (multi-word, not nickname)",
|
||||
"query": "214 north st filer",
|
||||
"check": lambda r: (
|
||||
r["count"] >= 1
|
||||
and r["results"][0]["source"] == "photon"
|
||||
# labeled_as=Home may or may not appear depending on Photon's
|
||||
# geocoding precision — the key invariant is that this multi-word
|
||||
# query flows through Photon, not the address book shortcut.
|
||||
and r["results"][0]["source"] == "netsyms"
|
||||
and r["results"][0]["confidence"] == "exact"
|
||||
and r["results"][0]["type"] == "street_address"
|
||||
),
|
||||
},
|
||||
{
|
||||
"name": "214 North St, Filer, ID → photon (case/punctuation)",
|
||||
"name": "214 North St, Filer, ID → netsyms (case/punctuation)",
|
||||
"query": "214 North St, Filer, ID",
|
||||
"check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "photon",
|
||||
"check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "netsyms",
|
||||
},
|
||||
{
|
||||
"name": "214 NORTH ST FILER ID → photon (uppercase)",
|
||||
"name": "214 NORTH ST FILER ID → netsyms (uppercase)",
|
||||
"query": "214 NORTH ST FILER ID",
|
||||
"check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "photon",
|
||||
"check": lambda r: r["count"] >= 1 and r["results"][0]["source"] == "netsyms",
|
||||
},
|
||||
{
|
||||
"name": "1600 Pennsylvania Ave Washington DC → White House",
|
||||
|
|
|
|||
193
lib/nav_tools.py
193
lib/nav_tools.py
|
|
@ -50,86 +50,14 @@ def _haversine_m(lat1, lon1, lat2, lon2):
|
|||
return R * 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
|
||||
|
||||
|
||||
def _classify_photon_feature(props, index):
|
||||
"""Classify a Photon feature into (type, confidence)."""
|
||||
osm_key = props.get('osm_key', '')
|
||||
osm_value = props.get('osm_value', '')
|
||||
feat_type = props.get('type', '')
|
||||
has_housenumber = bool(props.get('housenumber'))
|
||||
|
||||
# Type classification
|
||||
if has_housenumber or osm_value in ('house', 'residential'):
|
||||
result_type = 'street_address'
|
||||
elif feat_type in ('city', 'town', 'village', 'hamlet', 'county', 'state', 'country'):
|
||||
result_type = 'locality'
|
||||
elif osm_key in ('amenity', 'shop', 'tourism', 'leisure') or osm_value:
|
||||
result_type = 'poi'
|
||||
else:
|
||||
result_type = 'poi'
|
||||
|
||||
# Confidence — simple positional heuristic
|
||||
if index == 0:
|
||||
confidence = 'high'
|
||||
elif index <= 2:
|
||||
confidence = 'medium'
|
||||
else:
|
||||
confidence = 'low'
|
||||
|
||||
return result_type, confidence
|
||||
|
||||
|
||||
def _photon_feature_to_name(props):
|
||||
"""Build a display name from a Photon feature's properties."""
|
||||
parts = []
|
||||
housenumber = props.get('housenumber')
|
||||
street = props.get('street')
|
||||
name = props.get('name', '')
|
||||
|
||||
if housenumber and street:
|
||||
parts.append(f"{housenumber} {street}")
|
||||
if name and name != street:
|
||||
parts.append(name)
|
||||
elif name:
|
||||
parts.append(name)
|
||||
elif street:
|
||||
parts.append(street)
|
||||
|
||||
for key in ('city', 'county', 'state', 'country'):
|
||||
v = props.get(key)
|
||||
if v and (not parts or v != parts[-1]):
|
||||
parts.append(v)
|
||||
|
||||
return ', '.join(p for p in parts if p) or 'Unknown'
|
||||
|
||||
|
||||
def _annotate_with_address_book(results):
|
||||
"""Add labeled_as to results within ADDRESS_BOOK_ANNOTATION_RADIUS_M of an address book entry."""
|
||||
try:
|
||||
from . import address_book
|
||||
entries = address_book.load()
|
||||
except Exception:
|
||||
return
|
||||
|
||||
for result in results:
|
||||
rlat, rlon = result.get('lat'), result.get('lon')
|
||||
if rlat is None or rlon is None:
|
||||
continue
|
||||
for entry in entries:
|
||||
elat, elon = entry.get('lat'), entry.get('lon')
|
||||
if elat is None or elon is None:
|
||||
continue
|
||||
dist = _haversine_m(rlat, rlon, elat, elon)
|
||||
if dist <= ADDRESS_BOOK_ANNOTATION_RADIUS_M:
|
||||
result['labeled_as'] = entry['name']
|
||||
break
|
||||
def geocode(query: str, limit: int = 10):
|
||||
"""Delegate to the structured geocode module. See lib/geocode.py."""
|
||||
from . import geocode as geocode_mod
|
||||
return geocode_mod.geocode(query, limit=limit)
|
||||
|
||||
|
||||
def _geocode(query: str):
|
||||
"""Geocode a place name via address book then Photon. Returns (lat, lon, display_name) or raises.
|
||||
|
||||
Used internally by route() — returns a simple (lat, lon, name) tuple.
|
||||
For the full ranked-results API, use geocode() instead.
|
||||
"""
|
||||
"""Internal: returns (lat, lon, display_name) tuple for route()."""
|
||||
result = geocode(query, limit=1)
|
||||
results = result.get('results', [])
|
||||
if not results:
|
||||
|
|
@ -138,117 +66,6 @@ def _geocode(query: str):
|
|||
return top['lat'], top['lon'], top['name']
|
||||
|
||||
|
||||
|
||||
def geocode(query: str, limit: int = 10):
|
||||
"""
|
||||
Photon-first geocoding with ranked results.
|
||||
|
||||
Chain:
|
||||
1. Coordinate detection (pre-search)
|
||||
2. Address book nickname short-circuit (single-word queries only)
|
||||
3. Photon search (primary, biased to Idaho region)
|
||||
4. Address book proximity annotation (post-Photon, 75m radius)
|
||||
|
||||
Returns dict: {query, results: [...], count: N}
|
||||
Always 200-safe — empty results list is valid, never raises.
|
||||
|
||||
Netsyms is preserved at /api/netsyms/lookup for direct structured
|
||||
access. Enrichment of Photon street-address hits with USPS plus4
|
||||
from Netsyms is a planned follow-up (not wired here).
|
||||
"""
|
||||
limit = max(1, min(limit, 20))
|
||||
q = (query or '').strip()
|
||||
empty = {'query': q, 'results': [], 'count': 0}
|
||||
|
||||
if not q:
|
||||
return empty
|
||||
|
||||
# ── 1. Coordinate detection ──
|
||||
coords = _parse_coords(q)
|
||||
if coords:
|
||||
return {
|
||||
'query': q,
|
||||
'results': [{
|
||||
'name': q,
|
||||
'lat': coords[0],
|
||||
'lon': coords[1],
|
||||
'source': 'coordinates',
|
||||
'confidence': 'exact',
|
||||
'type': 'coordinates',
|
||||
'raw': None,
|
||||
}],
|
||||
'count': 1,
|
||||
}
|
||||
|
||||
# ── 2. Address book nickname short-circuit ──
|
||||
# Only short-circuit on single-word queries ("home", "work").
|
||||
# Multi-word queries fall through to Photon for proper ranking.
|
||||
normalized_q = ' '.join(q.lower().replace(',', ' ').split())
|
||||
is_single_word = ' ' not in normalized_q
|
||||
try:
|
||||
from . import address_book
|
||||
ab_match = address_book.lookup(q)
|
||||
if (ab_match
|
||||
and ab_match['confidence'] == 'exact'
|
||||
and ab_match.get('lat') and ab_match.get('lon')
|
||||
and is_single_word):
|
||||
logger.info("geocode: nickname short-circuit %r → %s", q, ab_match['name'])
|
||||
return {
|
||||
'query': q,
|
||||
'results': [{
|
||||
'name': ab_match.get('address') or ab_match['name'],
|
||||
'lat': ab_match['lat'],
|
||||
'lon': ab_match['lon'],
|
||||
'source': 'address_book',
|
||||
'confidence': 'exact',
|
||||
'type': 'nickname',
|
||||
'raw': ab_match,
|
||||
}],
|
||||
'count': 1,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.debug("geocode: address_book lookup failed: %s", e)
|
||||
|
||||
# ── 3. Photon search (primary) ──
|
||||
results = []
|
||||
try:
|
||||
params = {
|
||||
'q': q,
|
||||
'limit': limit,
|
||||
'lat': GEOCODE_BIAS_LAT,
|
||||
'lon': GEOCODE_BIAS_LON,
|
||||
'zoom': GEOCODE_BIAS_ZOOM,
|
||||
}
|
||||
resp = requests.get(f"{PHOTON_URL}/api", params=params, timeout=5)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
|
||||
for i, feature in enumerate(data.get('features', [])):
|
||||
props = feature.get('properties', {})
|
||||
geom_coords = feature.get('geometry', {}).get('coordinates', [0, 0])
|
||||
result_type, confidence = _classify_photon_feature(props, i)
|
||||
name = _photon_feature_to_name(props)
|
||||
results.append({
|
||||
'name': name,
|
||||
'lat': geom_coords[1],
|
||||
'lon': geom_coords[0],
|
||||
'source': 'photon',
|
||||
'confidence': confidence,
|
||||
'type': result_type,
|
||||
'raw': props,
|
||||
})
|
||||
except requests.RequestException as e:
|
||||
logger.warning("geocode: Photon request failed: %s", e)
|
||||
except Exception as e:
|
||||
logger.warning("geocode: Photon parse error: %s", e)
|
||||
|
||||
# ── 4. Address book annotation (post-Photon) ──
|
||||
_annotate_with_address_book(results)
|
||||
|
||||
logger.info("geocode: %r → %d results", q, len(results))
|
||||
return {'query': q, 'results': results, 'count': len(results)}
|
||||
|
||||
|
||||
def reverse_geocode(lat: float, lon: float) -> str:
|
||||
"""Reverse geocode coordinates via Photon. Returns formatted address string."""
|
||||
try:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue