recon/scripts/wiki_index_wave3.py
Ubuntu 5d618da2a4 Add wiki_index_wave3.py with parallel resolve
Wave 3 pipeline for processing 253K+ place types with NO wiki/wikidata
tags (US+CA only). Uses Gemini to resolve Wikipedia titles.

Key feature: resolve_wikipedia_titles() now uses ThreadPoolExecutor
with 5 parallel workers, improving throughput from ~14/min to ~75/min.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-04-30 21:37:51 +00:00

932 lines
32 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Wiki Location Index Pipeline — Wave 3
Processes high-coverage place types with NO wiki/wikidata tags (US+CA only).
Uses Gemini to resolve Wikipedia titles.
CRITICAL: Every Gemini call commits immediately. DB is the checkpoint.
Usage:
python wiki_index_wave3.py extract # Extract from JSONL
python wiki_index_wave3.py resolve # Resolve Wikipedia titles via Gemini
python wiki_index_wave3.py validate # Validate titles against ZIM
python wiki_index_wave3.py wikivoyage # Resolve+validate Wikivoyage titles
python wiki_index_wave3.py summarize # Generate summaries with Gemini
"""
import os
import sys
import json
import sqlite3
import logging
import time
import resource
import threading
import signal
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import zstandard as zstd
from bs4 import BeautifulSoup
from google import genai
from google.genai import types
import requests
# =============================================================================
# CONFIGURATION
# =============================================================================
JSONL_PATH = "/mnt/pi-nas/nav/photon-dump-planet.jsonl.zst"
DB_PATH = "/mnt/pi-nas/nav/wiki-index/data/wiki_index.db"
LOG_DIR = "/mnt/pi-nas/nav/wiki-index/logs"
GEMINI_JSONL = f"{LOG_DIR}/wave3_gemini.jsonl"
# US + Canada line range
COMBINED_START = 53694616
COMBINED_END = 175406527
# ZIM endpoints (VM 1130)
WIKIPEDIA_INTERNAL = "http://192.168.1.130:8430/wikipedia_en_all_maxi_2026-02"
WIKIVOYAGE_INTERNAL = "http://192.168.1.130:8430/wikivoyage_en_all_maxi_2026-03"
# Gemini
GEMINI_MODEL = "gemini-2.5-flash"
RESOLVE_WORKERS = 5
# Memory limit
MAX_RSS_MB = 10240 # 10GB
# Circuit breaker settings
CIRCUIT_BREAKER_CONSECUTIVE_429 = 10
CIRCUIT_BREAKER_PAUSE_1 = 300 # 5 minutes
CIRCUIT_BREAKER_PAUSE_2 = 600 # 10 minutes
CIRCUIT_BREAKER_MAX_PAUSES = 3
# Wave 3 type filter
WAVE3_TYPES = {
("place", "city"), ("place", "town"), ("place", "village"),
("place", "hamlet"), ("place", "borough"), ("place", "suburb"),
("boundary", "administrative"),
("natural", "peak"), ("natural", "volcano"), ("natural", "bay"),
("natural", "cape"), ("natural", "glacier"),
("leisure", "nature_reserve"), ("boundary", "national_park"),
("tourism", "museum"), ("amenity", "university"), ("aeroway", "aerodrome"),
}
TRAVEL_TYPES = {
("place", "city"), ("place", "town"), ("place", "village"),
("boundary", "national_park"), ("leisure", "nature_reserve"),
}
# Graceful shutdown flag
_shutdown_requested = False
_current_place_id = None
def signal_handler(signum, frame):
global _shutdown_requested
_shutdown_requested = True
log.warning(f"Shutdown requested (signal {signum}), finishing current place...")
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
# Thread-safe HTTP session
_http_session = None
_http_lock = threading.Lock()
def get_http_session():
global _http_session
if _http_session is None:
with _http_lock:
if _http_session is None:
_http_session = requests.Session()
_http_session.headers.update({"User-Agent": "Echo6WikiIndex/3.0"})
return _http_session
# =============================================================================
# LOGGING
# =============================================================================
def setup_logging():
Path(LOG_DIR).mkdir(parents=True, exist_ok=True)
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"{LOG_DIR}/wave3_{ts}.log"
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[logging.FileHandler(log_file), logging.StreamHandler()]
)
return logging.getLogger(__name__)
log = setup_logging()
# =============================================================================
# GEMINI AUDIT LOG
# =============================================================================
_gemini_log_lock = threading.Lock()
def log_gemini_response(place_id, stage, prompt, response, error=None):
"""Append every Gemini call to audit log."""
entry = {
"timestamp": datetime.utcnow().isoformat(),
"place_id": place_id,
"stage": stage,
"prompt": prompt[:500], # Truncate long prompts
"response": response,
"error": str(error) if error else None,
}
with _gemini_log_lock:
with open(GEMINI_JSONL, "a") as f:
f.write(json.dumps(entry) + "\n")
# =============================================================================
# MEMORY MONITORING
# =============================================================================
def check_memory(context=""):
rss_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
rss_mb = rss_kb / 1024 if sys.platform != 'darwin' else rss_kb / 1024 / 1024
if rss_mb > MAX_RSS_MB:
log.error(f"RSS {rss_mb:.0f}MB exceeds {MAX_RSS_MB}MB at {context}, aborting")
sys.exit(1)
return rss_mb
# =============================================================================
# DATABASE
# =============================================================================
def get_db():
conn = sqlite3.connect(DB_PATH)
# Ensure wiki_failures table exists
conn.execute("""
CREATE TABLE IF NOT EXISTS wiki_failures (
id INTEGER PRIMARY KEY AUTOINCREMENT,
place_id INTEGER,
stage TEXT,
error_type TEXT,
error_message TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP
)
""")
conn.commit()
return conn
def log_failure(conn, place_id, stage, error):
"""Log a failure to wiki_failures table."""
error_type = type(error).__name__
error_msg = str(error)[:1000]
conn.execute(
"INSERT INTO wiki_failures (place_id, stage, error_type, error_message) VALUES (?, ?, ?, ?)",
(place_id, stage, error_type, error_msg)
)
conn.commit()
def get_existing_place_keys(conn):
c = conn.cursor()
c.execute("""
SELECT place_name, osm_key, osm_value,
COALESCE(county,''), COALESCE(state,''), country_code
FROM wiki_places
""")
return set(tuple(row) for row in c.fetchall())
# =============================================================================
# CIRCUIT BREAKER
# =============================================================================
class CircuitBreaker:
def __init__(self):
self.consecutive_429s = 0
self.pause_count = 0
self.lock = threading.Lock()
def record_success(self):
with self.lock:
self.consecutive_429s = 0
def record_429(self):
"""Returns True if we should abort, False if we can continue."""
with self.lock:
self.consecutive_429s += 1
if self.consecutive_429s >= CIRCUIT_BREAKER_CONSECUTIVE_429:
self.pause_count += 1
if self.pause_count > CIRCUIT_BREAKER_MAX_PAUSES:
log.error(f"Circuit breaker: {CIRCUIT_BREAKER_MAX_PAUSES} pauses failed, aborting gracefully")
return True # Signal to abort
pause_time = CIRCUIT_BREAKER_PAUSE_1 if self.pause_count == 1 else CIRCUIT_BREAKER_PAUSE_2
log.warning(f"Circuit breaker: {self.consecutive_429s} consecutive 429s, pausing {pause_time}s (pause {self.pause_count}/{CIRCUIT_BREAKER_MAX_PAUSES})")
time.sleep(pause_time)
self.consecutive_429s = 0
return False # Continue
circuit_breaker = CircuitBreaker()
# =============================================================================
# RESPONSE VALIDATION
# =============================================================================
def validate_gemini_response(text):
"""Validate response is not empty, error, or HTML."""
if not text:
return False, "empty response"
if text.startswith("<!") or text.startswith("<html"):
return False, "HTML response"
if "error" in text.lower()[:50] and len(text) < 100:
return False, "error message"
return True, None
# =============================================================================
# STAGE 1: EXTRACT
# =============================================================================
def should_include_wave3(osm_key, osm_value, extra):
if extra.get("wikipedia") or extra.get("wikidata"):
return False
if (osm_key, osm_value) not in WAVE3_TYPES:
return False
if osm_key == "boundary" and osm_value == "administrative":
admin_level = extra.get("admin_level")
if admin_level:
try:
level = int(admin_level)
if level < 4 or level > 8:
return False
except ValueError:
return False
return True
def extract_from_jsonl():
"""Extract Wave 3 places from JSONL."""
conn = get_db()
c = conn.cursor()
log.info("Loading existing place keys...")
existing = get_existing_place_keys(conn)
log.info(f"Found {len(existing):,} existing places")
seen = set()
inserted = 0
skipped_type = 0
skipped_wiki = 0
skipped_no_name = 0
skipped_dupe = 0
skipped_existing = 0
log.info(f"Wave 3 extraction: lines {COMBINED_START:,} to {COMBINED_END:,}")
log.info(f"Type filter: {len(WAVE3_TYPES)} types")
with open(JSONL_PATH, 'rb') as fh:
dctx = zstd.ZstdDecompressor()
with dctx.stream_reader(fh) as reader:
import io
text_reader = io.TextIOWrapper(reader, encoding='utf-8')
line_num = 0
for line in text_reader:
line_num += 1
if line_num < COMBINED_START:
if line_num % 10_000_000 == 0:
log.info(f" Seeking... line {line_num:,}")
continue
if line_num > COMBINED_END:
break
if line_num % 1_000_000 == 0:
rss = check_memory(f"line {line_num}")
log.info(f" Line {line_num:,}, inserted {inserted:,}, RSS {rss:.0f}MB")
conn.commit()
try:
record = json.loads(line)
content = record.get("content", [{}])[0]
country_code = content.get("country_code", "")
if country_code not in ("us", "ca"):
continue
osm_key = content.get("osm_key", "")
osm_value = content.get("osm_value", "")
extra = content.get("extra", {})
if not should_include_wave3(osm_key, osm_value, extra):
if extra.get("wikipedia") or extra.get("wikidata"):
skipped_wiki += 1
else:
skipped_type += 1
continue
name_obj = content.get("name", {})
name = name_obj.get("name:en") or name_obj.get("name")
if not name:
skipped_no_name += 1
continue
address = content.get("address", {})
state = address.get("state") or address.get("state:en")
county = address.get("county") or address.get("county:en")
dedup_key = (name, osm_key, osm_value, county or "", state or "", country_code)
if dedup_key in existing:
skipped_existing += 1
continue
if dedup_key in seen:
skipped_dupe += 1
continue
seen.add(dedup_key)
osm_id = f"{content.get('object_type', '')}{content.get('object_id', '')}"
importance = content.get("importance")
extra_json = json.dumps({k: v for k, v in extra.items()
if k not in ("wikipedia", "wikidata")}) or None
c.execute("""
INSERT OR IGNORE INTO wiki_places
(place_name, osm_key, osm_value, county, state, country_code,
osm_id, importance, extra_json, source)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 'gemini')
""", (name, osm_key, osm_value, county, state, country_code,
osm_id, importance, extra_json))
if c.rowcount > 0:
inserted += 1
if inserted % 10000 == 0:
conn.commit()
except json.JSONDecodeError:
continue
except Exception as e:
log.error(f"Error line {line_num}: {e}")
continue
conn.commit()
conn.close()
log.info(f"Wave 3 extraction complete:")
log.info(f" Inserted: {inserted:,}")
log.info(f" Skipped (has wiki/wikidata): {skipped_wiki:,}")
log.info(f" Skipped (type filter): {skipped_type:,}")
log.info(f" Skipped (no name): {skipped_no_name:,}")
log.info(f" Skipped (duplicate): {skipped_dupe:,}")
log.info(f" Skipped (existing): {skipped_existing:,}")
return inserted
# =============================================================================
# STAGE 2: RESOLVE WIKIPEDIA TITLES VIA GEMINI
# =============================================================================
def resolve_worker(args):
"""Worker function for parallel Wikipedia title resolution."""
pid, name, state, country_code, client, circuit_breaker, db_lock = args
country = "United States" if country_code == "us" else "Canada"
location = f"{state}, {country}" if state else country
prompt = f"What is the exact Wikipedia article title for {name}, {location}? Reply with just the article title, or NONE if no article exists."
result = {
"place_id": pid,
"success": False,
"title": None,
"is_none": False,
"error": None,
"is_rate_limit": False
}
try:
response = client.models.generate_content(
model=GEMINI_MODEL,
contents=prompt,
config=types.GenerateContentConfig(max_output_tokens=100, temperature=0.1)
)
circuit_breaker.record_success()
text = response.text.strip() if response.text else ""
log_gemini_response(pid, "resolve", prompt, text)
# Validate response
valid, reason = validate_gemini_response(text)
if not valid:
result["error"] = f"Invalid response: {reason}"
return result
# Clean up and store
if text.upper() == "NONE":
result["success"] = True
result["is_none"] = True
result["title"] = None
else:
result["success"] = True
result["title"] = text.replace("**", "").strip().strip('"')
return result
except Exception as e:
err_str = str(e).lower()
log_gemini_response(pid, "resolve", prompt, None, error=e)
if "429" in err_str or "quota" in err_str or "rate" in err_str:
result["is_rate_limit"] = True
should_abort = circuit_breaker.record_429()
if should_abort:
result["error"] = "circuit_breaker_abort"
return result
result["error"] = str(e)
return result
def resolve_wikipedia_titles():
"""Resolve Wikipedia titles for Wave 3 places via Gemini (parallel)."""
global _shutdown_requested, _current_place_id
api_key = os.environ.get("GEMINI_API_KEY")
if not api_key:
env_path = Path("/home/zvx/projects/wiki-index/.env")
if env_path.exists():
for line in env_path.read_text().splitlines():
if line.startswith("GEMINI_API_KEY="):
api_key = line.split("=", 1)[1].strip().strip('"')
break
if not api_key:
log.error("GEMINI_API_KEY not set")
return
client = genai.Client(api_key=api_key)
conn = get_db()
c = conn.cursor()
db_lock = threading.Lock()
# STARTUP VERIFICATION: count already processed
c.execute("SELECT COUNT(*) FROM wiki_places WHERE source = 'gemini' AND wikipedia_title IS NOT NULL")
already_done = c.fetchone()[0]
# Get unprocessed rows (DB is the checkpoint)
c.execute("""
SELECT id, place_name, state, country_code FROM wiki_places
WHERE source = 'gemini' AND wikipedia_title IS NULL
ORDER BY id
""")
rows = c.fetchall()
total = len(rows)
log.info(f"[RESOLVE] Resuming: {already_done:,} already resolved, {total:,} remaining")
log.info(f"[RESOLVE] Using {RESOLVE_WORKERS} parallel workers")
if not rows:
log.info("[RESOLVE] No places need title resolution")
return
resolved = 0
no_article = 0
errors = 0
processed = 0
start_time = time.time()
abort_requested = False
# Prepare work items
work_items = [
(pid, name, state, country_code, client, circuit_breaker, db_lock)
for pid, name, state, country_code in rows
]
with ThreadPoolExecutor(max_workers=RESOLVE_WORKERS) as executor:
futures = {
executor.submit(resolve_worker, item): item[0]
for item in work_items
}
for future in as_completed(futures):
if _shutdown_requested or abort_requested:
log.info(f"[RESOLVE] Shutdown requested, cancelling remaining futures...")
executor.shutdown(wait=False, cancel_futures=True)
break
result = future.result()
processed += 1
pid = result["place_id"]
_current_place_id = pid
# Progress logging every 100
if processed % 100 == 0:
elapsed = time.time() - start_time
rate = processed / (elapsed / 60) if elapsed > 0 else 0
pct = (processed / total) * 100
log.info(f"[RESOLVE] Progress: {processed:,}/{total:,} ({pct:.1f}%) | Resolved: {resolved} | None: {no_article} | Errors: {errors} | Rate: {rate:.1f}/min")
check_memory(f"resolve {processed}")
if result["success"]:
# Thread-safe DB write
with db_lock:
c.execute("UPDATE wiki_places SET wikipedia_title = ? WHERE id = ?", (result["title"], pid))
conn.commit()
if result["is_none"]:
no_article += 1
else:
resolved += 1
else:
if result["error"] == "circuit_breaker_abort":
log.error(f"[RESOLVE] Circuit breaker abort at place_id {pid}")
abort_requested = True
else:
# Log failure
with db_lock:
log_failure(conn, pid, "resolve", Exception(result["error"]))
errors += 1
conn.close()
log.info(f"[RESOLVE] Complete: resolved={resolved:,}, none={no_article:,}, errors={errors:,}")
# =============================================================================
# STAGE 3: VALIDATE AGAINST ZIM
# =============================================================================
def validate_wikipedia_title(title):
if not title:
return False
encoded = requests.utils.quote(title.replace(" ", "_"))
url = f"{WIKIPEDIA_INTERNAL}/A/{encoded}"
try:
resp = get_http_session().head(url, timeout=5)
return resp.status_code == 200
except:
return False
def validate_titles():
"""Validate resolved Wikipedia titles against ZIM."""
conn = get_db()
c = conn.cursor()
# STARTUP VERIFICATION
c.execute("SELECT COUNT(*) FROM wiki_places WHERE source = 'gemini' AND wikipedia_exists IS NOT NULL")
already_done = c.fetchone()[0]
c.execute("""
SELECT id, wikipedia_title FROM wiki_places
WHERE source = 'gemini' AND wikipedia_title IS NOT NULL AND wikipedia_exists IS NULL
""")
rows = c.fetchall()
total = len(rows)
log.info(f"[VALIDATE] Resuming: {already_done:,} already validated, {total:,} remaining")
if not rows:
log.info("[VALIDATE] No titles need validation")
return
exists_count = 0
missing_count = 0
start_time = time.time()
for i, (pid, title) in enumerate(rows):
if _shutdown_requested:
log.info(f"[VALIDATE] Shutting down at place_id {pid}")
break
if i > 0 and i % 100 == 0:
elapsed = time.time() - start_time
rate = i / (elapsed / 60) if elapsed > 0 else 0
pct = (i / total) * 100
log.info(f"[VALIDATE] Progress: {i:,}/{total:,} ({pct:.1f}%) | Exists: {exists_count} | Missing: {missing_count} | Rate: {rate:.1f}/min")
exists = 1 if validate_wikipedia_title(title) else 0
c.execute("UPDATE wiki_places SET wikipedia_exists = ? WHERE id = ?", (exists, pid))
conn.commit()
if exists:
exists_count += 1
else:
missing_count += 1
conn.close()
log.info(f"[VALIDATE] Complete: exists={exists_count:,}, missing={missing_count:,}")
# =============================================================================
# STAGE 4: WIKIVOYAGE RESOLUTION
# =============================================================================
def validate_wikivoyage_title(title):
if not title:
return False
encoded = requests.utils.quote(title.replace(" ", "_"))
url = f"{WIKIVOYAGE_INTERNAL}/A/{encoded}"
try:
resp = get_http_session().head(url, timeout=5)
return resp.status_code == 200
except:
return False
def resolve_wikivoyage():
"""Resolve and validate Wikivoyage titles for travel-relevant types."""
global _shutdown_requested, _current_place_id
api_key = os.environ.get("GEMINI_API_KEY")
if not api_key:
env_path = Path("/home/zvx/projects/wiki-index/.env")
if env_path.exists():
for line in env_path.read_text().splitlines():
if line.startswith("GEMINI_API_KEY="):
api_key = line.split("=", 1)[1].strip().strip('"')
break
if not api_key:
log.error("GEMINI_API_KEY not set")
return
client = genai.Client(api_key=api_key)
conn = get_db()
c = conn.cursor()
# STARTUP VERIFICATION
c.execute("SELECT COUNT(*) FROM wiki_places WHERE source = 'gemini' AND wikivoyage_title IS NOT NULL")
already_done = c.fetchone()[0]
type_conditions = " OR ".join([f"(osm_key = ? AND osm_value = ?)" for _ in TRAVEL_TYPES])
params = [p for t in TRAVEL_TYPES for p in t]
c.execute(f"""
SELECT id, place_name, state, country_code FROM wiki_places
WHERE source = 'gemini'
AND wikipedia_exists = 1
AND wikivoyage_title IS NULL
AND ({type_conditions})
ORDER BY id
""", params)
rows = c.fetchall()
total = len(rows)
log.info(f"[WIKIVOYAGE] Resuming: {already_done:,} already resolved, {total:,} remaining")
if not rows:
log.info("[WIKIVOYAGE] No places need Wikivoyage resolution")
return
resolved = 0
validated = 0
errors = 0
start_time = time.time()
for i, (pid, name, state, country_code) in enumerate(rows):
if _shutdown_requested:
log.info(f"[WIKIVOYAGE] Shutting down at place_id {pid}")
break
_current_place_id = pid
if i > 0 and i % 100 == 0:
elapsed = time.time() - start_time
rate = i / (elapsed / 60) if elapsed > 0 else 0
pct = (i / total) * 100
log.info(f"[WIKIVOYAGE] Progress: {i:,}/{total:,} ({pct:.1f}%) | Resolved: {resolved} | Validated: {validated} | Errors: {errors} | Rate: {rate:.1f}/min")
country = "United States" if country_code == "us" else "Canada"
location = f"{state}, {country}" if state else country
prompt = f"What is the exact Wikivoyage article title for {name}, {location}? Reply with just the title, or NONE if no article exists."
try:
response = client.models.generate_content(
model=GEMINI_MODEL,
contents=prompt,
config=types.GenerateContentConfig(max_output_tokens=100, temperature=0.1)
)
circuit_breaker.record_success()
text = response.text.strip() if response.text else ""
log_gemini_response(pid, "wikivoyage", prompt, text)
valid, reason = validate_gemini_response(text)
if not valid:
log_failure(conn, pid, "wikivoyage", Exception(f"Invalid response: {reason}"))
c.execute("UPDATE wiki_places SET wikivoyage_exists = 0 WHERE id = ?", (pid,))
conn.commit()
errors += 1
continue
if text.upper() == "NONE":
c.execute("UPDATE wiki_places SET wikivoyage_exists = 0 WHERE id = ?", (pid,))
else:
title = text.replace("**", "").strip().strip('"')
exists = 1 if validate_wikivoyage_title(title) else 0
c.execute("UPDATE wiki_places SET wikivoyage_title = ?, wikivoyage_exists = ? WHERE id = ?",
(title, exists, pid))
resolved += 1
if exists:
validated += 1
conn.commit()
except Exception as e:
err_str = str(e).lower()
log_gemini_response(pid, "wikivoyage", prompt, None, error=e)
if "429" in err_str or "quota" in err_str:
should_abort = circuit_breaker.record_429()
if should_abort:
log.error(f"[WIKIVOYAGE] Circuit breaker abort at place_id {pid}")
break
log_failure(conn, pid, "wikivoyage", e)
errors += 1
continue
time.sleep(0.1)
conn.close()
log.info(f"[WIKIVOYAGE] Complete: resolved={resolved:,}, validated={validated:,}, errors={errors:,}")
# =============================================================================
# STAGE 5: SUMMARIZE
# =============================================================================
def fetch_wiki_content(title, is_wikivoyage=False):
base = WIKIVOYAGE_INTERNAL if is_wikivoyage else WIKIPEDIA_INTERNAL
encoded = requests.utils.quote(title.replace(" ", "_"))
url = f"{base}/A/{encoded}"
try:
resp = get_http_session().get(url, timeout=10)
if resp.status_code != 200:
return None
soup = BeautifulSoup(resp.text, 'html.parser')
for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
tag.decompose()
text = soup.get_text(separator=' ', strip=True)
return text[:15000]
except:
return None
def summarize():
"""Generate summaries for validated Wave 3 places."""
global _shutdown_requested, _current_place_id
api_key = os.environ.get("GEMINI_API_KEY")
if not api_key:
env_path = Path("/home/zvx/projects/wiki-index/.env")
if env_path.exists():
for line in env_path.read_text().splitlines():
if line.startswith("GEMINI_API_KEY="):
api_key = line.split("=", 1)[1].strip().strip('"')
break
if not api_key:
log.error("GEMINI_API_KEY not set")
return
client = genai.Client(api_key=api_key)
conn = get_db()
c = conn.cursor()
# STARTUP VERIFICATION
c.execute("SELECT COUNT(*) FROM wiki_places WHERE source = 'gemini' AND summary IS NOT NULL")
already_done = c.fetchone()[0]
c.execute("""
SELECT id, place_name, wikipedia_title, wikivoyage_title,
wikipedia_exists, wikivoyage_exists
FROM wiki_places
WHERE source = 'gemini'
AND (wikipedia_exists = 1 OR wikivoyage_exists = 1)
AND summary IS NULL
ORDER BY id
""")
rows = c.fetchall()
total = len(rows)
log.info(f"[SUMMARIZE] Resuming: {already_done:,} already summarized, {total:,} remaining")
if not rows:
log.info("[SUMMARIZE] No places need summaries")
return
success = 0
errors = 0
start_time = time.time()
for i, (pid, name, wp_title, wv_title, wp_exists, wv_exists) in enumerate(rows):
if _shutdown_requested:
log.info(f"[SUMMARIZE] Shutting down at place_id {pid}")
break
_current_place_id = pid
if i > 0 and i % 100 == 0:
elapsed = time.time() - start_time
rate = i / (elapsed / 60) if elapsed > 0 else 0
pct = (i / total) * 100
log.info(f"[SUMMARIZE] Progress: {i:,}/{total:,} ({pct:.1f}%) | Success: {success} | Errors: {errors} | Rate: {rate:.1f}/min")
check_memory(f"summarize {i}")
# Fetch content
content = None
if wp_exists and wp_title:
content = fetch_wiki_content(wp_title, is_wikivoyage=False)
if not content and wv_exists and wv_title:
content = fetch_wiki_content(wv_title, is_wikivoyage=True)
if not content:
log_failure(conn, pid, "summarize", Exception("No content fetched"))
errors += 1
continue
prompt = f"""Summarize this Wikipedia article about {name} in 2-3 sentences for a map application. Focus on what makes it notable or interesting to visitors. Be concise.
Article:
{content[:8000]}"""
try:
response = client.models.generate_content(
model=GEMINI_MODEL,
contents=prompt,
config=types.GenerateContentConfig(max_output_tokens=500, temperature=0.3)
)
circuit_breaker.record_success()
text = response.text.strip() if response.text else ""
log_gemini_response(pid, "summarize", f"Summarize {name}", text)
valid, reason = validate_gemini_response(text)
if not valid:
log_failure(conn, pid, "summarize", Exception(f"Invalid response: {reason}"))
errors += 1
continue
# COMMIT BEFORE NEXT
c.execute("UPDATE wiki_places SET summary = ? WHERE id = ?", (text, pid))
conn.commit()
success += 1
except Exception as e:
err_str = str(e).lower()
log_gemini_response(pid, "summarize", f"Summarize {name}", None, error=e)
if "429" in err_str or "quota" in err_str:
should_abort = circuit_breaker.record_429()
if should_abort:
log.error(f"[SUMMARIZE] Circuit breaker abort at place_id {pid}")
break
log_failure(conn, pid, "summarize", e)
errors += 1
continue
time.sleep(0.1)
conn.close()
log.info(f"[SUMMARIZE] Complete: success={success:,}, errors={errors:,}")
# =============================================================================
# MAIN
# =============================================================================
def main():
if len(sys.argv) < 2:
print(__doc__)
sys.exit(1)
cmd = sys.argv[1]
if cmd == "extract":
count = extract_from_jsonl()
log.info(f"\n*** EXTRACT COMPLETE: {count:,} places ***")
log.info("Review count before proceeding to resolve stage.")
elif cmd == "resolve":
resolve_wikipedia_titles()
elif cmd == "validate":
validate_titles()
elif cmd == "wikivoyage":
resolve_wikivoyage()
elif cmd == "summarize":
summarize()
elif cmd == "all":
count = extract_from_jsonl()
log.info(f"Extracted {count:,} places")
if not _shutdown_requested:
resolve_wikipedia_titles()
if not _shutdown_requested:
validate_titles()
if not _shutdown_requested:
resolve_wikivoyage()
if not _shutdown_requested:
summarize()
else:
print(f"Unknown command: {cmd}")
print(__doc__)
sys.exit(1)
if __name__ == "__main__":
main()