Add business_intent_poi_boost reranker signal

When a query contains no road-type keywords (st, blvd, ave, etc.) and
its intent is not LOCALITY, POSTCODE, or COORD, boost
amenity/shop/tourism/leisure/office/craft results (+3.0) and penalize
highway/route results (-4.0). This fixes searches like
"starbucks twin falls" where a named service road outranked the
actual business POI due to Photon position tiebreaking.

Also fixes:
- Intent classifier now recognizes full state names ("idaho" not
  just "ID") for LOCALITY classification
- Locality-type Photon results now fall back to the name field for
  _city when Photon omits city, so they participate in locality_fuzz
  scoring
- Trace logging now covers all candidates (previously only the top 3)
  and includes each candidate's osm_key/osm_value

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-21 19:39:37 +00:00
commit 620f99c762

View file

@ -51,6 +51,8 @@ W_SOURCE_AUTHORITY = 2.0 # Netsyms for US addresses
W_LAYER_RANK = 1.0 # type-appropriate results ranked higher W_LAYER_RANK = 1.0 # type-appropriate results ranked higher
W_PHOTON_POSITION_NORM = 1.0 # Photon's native ranking (normalized by position) W_PHOTON_POSITION_NORM = 1.0 # Photon's native ranking (normalized by position)
W_STATE_EXACT = 1.0 # exact state code match W_STATE_EXACT = 1.0 # exact state code match
W_POI_CLASS_BOOST = 3.0 # amenity/shop/etc boost for business-name queries
W_HIGHWAY_CLASS_PENALTY = -4.0 # highway/route penalty for business-name queries
# ── US abbreviation expansions ── # ── US abbreviation expansions ──
# Applied ONLY to parsed StreetName/StreetNamePostType tokens, NOT to ordinals. # Applied ONLY to parsed StreetName/StreetNamePostType tokens, NOT to ordinals.
@ -66,6 +68,13 @@ _DIRECTIONAL_ABBREVS = {
} }
_ORDINAL_RE = re.compile(r'^\d+(st|nd|rd|th)$', re.IGNORECASE) _ORDINAL_RE = re.compile(r'^\d+(st|nd|rd|th)$', re.IGNORECASE)
# ── Road keywords ──
# Used to tell a road query apart from a business/POI query: the set holds
# every key and every value of _STREET_TYPE_ABBREVS, plus a few extra tokens
# listed explicitly below.
_ROAD_KEYWORDS = {
    *_STREET_TYPE_ABBREVS,
    *_STREET_TYPE_ABBREVS.values(),
    'route', 'rte', 'pass',
}
# ── US state codes ── # ── US state codes ──
_STATE_CODES = { _STATE_CODES = {
'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
@ -75,6 +84,24 @@ _STATE_CODES = {
'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'DC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'DC',
} }
# ── Full state name → code (for intent classifier) ──
_STATE_NAME_TO_CODE = {
'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR',
'california': 'CA', 'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE',
'florida': 'FL', 'georgia': 'GA', 'hawaii': 'HI', 'idaho': 'ID',
'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA', 'kansas': 'KS',
'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN',
'mississippi': 'MS', 'missouri': 'MO', 'montana': 'MT', 'nebraska': 'NE',
'nevada': 'NV', 'new hampshire': 'NH', 'new jersey': 'NJ',
'new mexico': 'NM', 'new york': 'NY', 'north carolina': 'NC',
'north dakota': 'ND', 'ohio': 'OH', 'oklahoma': 'OK', 'oregon': 'OR',
'pennsylvania': 'PA', 'rhode island': 'RI', 'south carolina': 'SC',
'south dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT',
'vermont': 'VT', 'virginia': 'VA', 'washington': 'WA',
'west virginia': 'WV', 'wisconsin': 'WI', 'wyoming': 'WY',
}
# Coordinate regex # Coordinate regex
_COORD_RE = re.compile(r'^\s*(-?\d+\.?\d*)\s*[,\s]\s*(-?\d+\.?\d*)\s*$') _COORD_RE = re.compile(r'^\s*(-?\d+\.?\d*)\s*[,\s]\s*(-?\d+\.?\d*)\s*$')
@ -208,12 +235,26 @@ def _classify_and_parse(query):
elif zipcode and not number and not street_name: elif zipcode and not number and not street_name:
return 'POSTCODE', parsed return 'POSTCODE', parsed
elif addr_type == 'Ambiguous': elif addr_type == 'Ambiguous':
# Check if it looks like a locality: 2 tokens, second is a state code # Check if it looks like a locality: last token(s) are a state code or name
tokens = q.replace(',', ' ').split() tokens = q.replace(',', ' ').split()
if len(tokens) >= 2 and tokens[-1].upper() in _STATE_CODES: if len(tokens) >= 2:
parsed['city'] = ' '.join(tokens[:-1]) last_upper = tokens[-1].upper()
parsed['state'] = tokens[-1].upper() if last_upper in _STATE_CODES:
return 'LOCALITY', parsed parsed['city'] = ' '.join(tokens[:-1])
parsed['state'] = last_upper
return 'LOCALITY', parsed
# Check full state names (single-word like "idaho" or two-word like "new york")
last_lower = tokens[-1].lower()
if last_lower in _STATE_NAME_TO_CODE:
parsed['city'] = ' '.join(tokens[:-1])
parsed['state'] = _STATE_NAME_TO_CODE[last_lower]
return 'LOCALITY', parsed
if len(tokens) >= 3:
two_word = f"{tokens[-2].lower()} {last_lower}"
if two_word in _STATE_NAME_TO_CODE:
parsed['city'] = ' '.join(tokens[:-2])
parsed['state'] = _STATE_NAME_TO_CODE[two_word]
return 'LOCALITY', parsed
return 'UNKNOWN', parsed return 'UNKNOWN', parsed
else: else:
return 'UNKNOWN', parsed return 'UNKNOWN', parsed
@ -363,7 +404,8 @@ def _parse_photon_features(features, source):
'_photon_rank': i, '_photon_rank': i,
'_number': props.get('housenumber', ''), '_number': props.get('housenumber', ''),
'_street': props.get('street', ''), '_street': props.get('street', ''),
'_city': props.get('city', ''), # For locality results, the name IS the city (Photon omits 'city' on city-type features)
'_city': props.get('city', '') or (props.get('name', '') if rtype == 'locality' else ''),
'_state': props.get('state', ''), '_state': props.get('state', ''),
}) })
return results return results
@ -476,6 +518,21 @@ def _score_candidate(candidate, parsed, intent):
signals['photon_position'] = round(score, 2) signals['photon_position'] = round(score, 2)
total += score total += score
# ── Business intent POI boost ──
# When the query has no road keywords (likely a business/POI search),
# boost amenity/shop/etc results and penalize highway/route results.
# Skipped for LOCALITY, POSTCODE, COORD queries where class is irrelevant.
if intent not in ('LOCALITY', 'POSTCODE', 'COORD'):
q_tokens_lower = set(parsed.get('raw_query', '').lower().replace(',', ' ').split())
if not (q_tokens_lower & _ROAD_KEYWORDS):
osm_key = (candidate.get('raw') or {}).get('osm_key', '')
if osm_key in ('amenity', 'shop', 'tourism', 'leisure', 'office', 'craft'):
signals['poi_class_boost'] = W_POI_CLASS_BOOST
total += W_POI_CLASS_BOOST
elif osm_key in ('highway', 'route'):
signals['highway_class_penalty'] = W_HIGHWAY_CLASS_PENALTY
total += W_HIGHWAY_CLASS_PENALTY
return round(total, 2), signals return round(total, 2), signals
@ -526,10 +583,13 @@ def _rerank(candidates, parsed, intent, query, limit):
# Trace log for audit # Trace log for audit
_trace_logger.debug("─── Query: %r intent=%s ───", query, intent) _trace_logger.debug("─── Query: %r intent=%s ───", query, intent)
for i, c in enumerate(scored[:3]): for i, c in enumerate(scored):
osm_key = (c.get('raw') or {}).get('osm_key', '')
osm_val = (c.get('raw') or {}).get('osm_value', '')
_trace_logger.debug( _trace_logger.debug(
" #%d score=%.2f src=%s name=%s", " #%d score=%.2f src=%s key=%s/%s name=%s",
i, c['_score'], c.get('source', '?'), c.get('name', '?')[:60] i, c['_score'], c.get('source', '?'), osm_key, osm_val,
c.get('name', '?')[:60]
) )
_trace_logger.debug(" signals=%s", c.get('_signals', {})) _trace_logger.debug(" signals=%s", c.get('_signals', {}))