mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 14:44:54 +02:00
1128 lines
39 KiB
Python
1128 lines
39 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Wiki Location Index Pipeline — Wave 2
|
||
|
|
Processes places with extra.wikidata but NO extra.wikipedia tags from Photon JSONL dump.
|
||
|
|
Resolves Wikipedia titles via Wikidata API.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python wiki_index_wave2.py extract # Extract from JSONL (wikidata only)
|
||
|
|
python wiki_index_wave2.py resolve # Resolve Wikipedia titles via Wikidata
|
||
|
|
python wiki_index_wave2.py validate # Validate titles against ZIM
|
||
|
|
python wiki_index_wave2.py summarize # Generate summaries with Gemini
|
||
|
|
python wiki_index_wave2.py summarize --workers=10 # Use 10 concurrent workers
|
||
|
|
python wiki_index_wave2.py summarize --dry-run # Process only 5 places (test run)
|
||
|
|
python wiki_index_wave2.py revalidate # Re-validate corrected titles
|
||
|
|
python wiki_index_wave2.py all # Run all stages
|
||
|
|
"""
|
||
|
|
|
||
|
|
import os
|
||
|
|
import sys
|
||
|
|
import json
|
||
|
|
import sqlite3
|
||
|
|
import logging
|
||
|
|
import time
|
||
|
|
import resource
|
||
|
|
import threading
|
||
|
|
from datetime import datetime
|
||
|
|
from pathlib import Path
|
||
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
|
|
|
||
|
|
import zstandard as zstd
|
||
|
|
from bs4 import BeautifulSoup
|
||
|
|
from google import genai
|
||
|
|
from google.genai import types
|
||
|
|
import requests
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# CONFIGURATION
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
# Paths
JSONL_PATH = "/mnt/pi-nas/nav/photon-dump-planet.jsonl.zst"
DB_PATH = "/mnt/pi-nas/nav/wiki-index/data/wiki_index.db"
LOG_DIR = "/mnt/pi-nas/nav/wiki-index/logs"
CHECKPOINT_DIR = "/mnt/pi-nas/nav/wiki-index/data"
GEMINI_LOG = LOG_DIR + "/gemini_responses_wave2.jsonl"

# Checkpoint files (wave 2 specific), one per pipeline stage
EXTRACT_CHECKPOINT = CHECKPOINT_DIR + "/wave2_extract_checkpoint.txt"
RESOLVE_CHECKPOINT = CHECKPOINT_DIR + "/wave2_resolve_checkpoint.txt"
VALIDATE_CHECKPOINT = CHECKPOINT_DIR + "/wave2_validate_checkpoint.txt"
SUMMARIZE_CHECKPOINT = CHECKPOINT_DIR + "/wave2_summarize_checkpoint.txt"

# Single-pass line range for US + Canada (1-based line numbers in the dump)
COMBINED_START = 53694616
COMBINED_END = 175406527

# ZIM endpoints
WIKIPEDIA_INTERNAL = "http://192.168.1.130:8430/wikipedia_en_all_maxi_2026-02"
WIKIVOYAGE_INTERNAL = "http://192.168.1.130:8430/wikivoyage_en_all_maxi_2026-03"

# Wikidata API
WIKIDATA_API = "https://www.wikidata.org/w/api.php"
WIKIDATA_BATCH_SIZE = 50  # Q-IDs sent per wbgetentities request

# Gemini
GEMINI_MODEL = "gemini-2.5-flash"
MAX_RETRIES = 3
RETRY_DELAYS = [1, 5, 30]  # seconds slept after attempt 1, 2, 3

# Concurrency
VALIDATION_WORKERS = 4
SUMMARIZE_WORKERS = 5

# Memory limit (MB)
MAX_RSS_MB = 10 * 1024  # 10GB

# Circuit breaker
CIRCUIT_BREAKER_THRESHOLD = 50  # 50 consecutive 429s
CIRCUIT_BREAKER_PAUSE = 5 * 60  # 5 minutes, in seconds

# Included types (same as wave 1).
# Maps osm_key -> set of accepted osm_value strings; None accepts any value.
INCLUDE_KEYS = {
    "place": {
        "city", "town", "village", "hamlet", "suburb", "island",
        "islet", "state", "county", "region", "locality",
    },
    "natural": {
        "peak", "volcano", "bay", "beach", "cape", "cliff", "water", "wetland",
        "wood", "glacier", "valley", "strait", "reef", "hot_spring", "geyser",
        "cave_entrance",
    },
    "waterway": {"river", "stream", "waterfall", "dam", "canal", "rapids"},
    "water": {"lake", "pond", "reservoir", "lagoon"},
    "boundary": {"protected_area", "national_park", "administrative"},
    "leisure": {"nature_reserve", "park"},
    "mountain_pass": None,
    "landuse": {"cemetery"},
    "historic": None,
    "tourism": {"attraction", "viewpoint"},
}

# Travel-relevant types (get Wikivoyage resolution), as (osm_key, osm_value)
TRAVEL_TYPES = {
    ("place", "city"),
    ("place", "town"),
    ("place", "state"),
    ("place", "country"),
    ("place", "island"),
    ("boundary", "national_park"),
    ("boundary", "protected_area"),
    ("leisure", "nature_reserve"),
    ("leisure", "park"),
    ("tourism", "attraction"),
}
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# LOGGING SETUP
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
def setup_logging():
    """Configure logging to a timestamped file and to the console.

    Creates LOG_DIR if needed, then installs two handlers through
    ``logging.basicConfig``: a file handler writing
    ``wave2_<YYYYmmdd_HHMMSS>.log`` inside LOG_DIR and a stream handler.

    Returns:
        The logger named after this module.
    """
    Path(LOG_DIR).mkdir(parents=True, exist_ok=True)

    started_at = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_handler = logging.FileHandler(f"{LOG_DIR}/wave2_{started_at}.log")
    console_handler = logging.StreamHandler()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[file_handler, console_handler],
    )
    return logging.getLogger(__name__)


# Module-wide logger; configured once at import time.
log = setup_logging()
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# MEMORY MONITORING
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
def check_memory(context=""):
    """Abort the process when the memory high-water mark exceeds the budget.

    Reads ``ru_maxrss`` (the maximum resident set size) for this process and
    converts it to megabytes. The raw value is divided by 1024*1024 on
    darwin and by 1024 on every other platform.

    Args:
        context: Free-form label included in the error message.

    Returns:
        The measured value in MB when it is within MAX_RSS_MB.

    Raises:
        SystemExit: With status 1 when the value exceeds MAX_RSS_MB.
    """
    usage = resource.getrusage(resource.RUSAGE_SELF)
    divisor = 1024 * 1024 if sys.platform == 'darwin' else 1024
    rss_mb = usage.ru_maxrss / divisor

    if rss_mb <= MAX_RSS_MB:
        return rss_mb

    log.error(f"RSS {rss_mb:.0f}MB exceeds {MAX_RSS_MB}MB budget at {context}, aborting")
    sys.exit(1)
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# CHECKPOINTS
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
def write_checkpoint(path, *values):
    """Write checkpoint values to file, one value per line.

    The content is first written to a sibling ``<name>.tmp`` file and then
    moved over the target with ``os.replace``. A crash in the middle of a
    write therefore leaves the previous checkpoint in place instead of an
    empty or half-written file.

    Args:
        path: Destination checkpoint file; parent directories are created.
        *values: Values to store; each is formatted with ``str()``.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Same directory as the target so the final rename stays on one filesystem.
    tmp_path = target.with_name(target.name + ".tmp")
    with open(tmp_path, 'w') as f:
        f.writelines(f"{v}\n" for v in values)
    os.replace(tmp_path, target)

    log.info(f"Checkpoint written: {path}")
|
||
|
|
|
||
|
|
def read_checkpoint(path):
    """Load a checkpoint file as a list of stripped lines.

    Args:
        path: Checkpoint file location.

    Returns:
        None when the file does not exist; otherwise a list with one
        whitespace-stripped string per line of the file.
    """
    if not Path(path).exists():
        return None

    entries = []
    with open(path, 'r') as handle:
        for raw_line in handle:
            entries.append(raw_line.strip())
    return entries
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# DATABASE
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
def get_db():
    """Open and return a new SQLite connection to the database at DB_PATH."""
    connection = sqlite3.connect(DB_PATH)
    return connection
|
||
|
|
|
||
|
|
def get_existing_place_names():
    """Get set of (place_name, osm_key, osm_value, county, state, country_code) from DB.

    NULL ``county`` and ``state`` values are returned as empty strings
    (via COALESCE).

    Returns:
        A set of 6-tuples, one per distinct row combination in wiki_places.
    """
    conn = get_db()
    try:
        c = conn.cursor()
        c.execute("""
            SELECT place_name, osm_key, osm_value,
                   COALESCE(county,''), COALESCE(state,''), country_code
            FROM wiki_places
        """)
        return {tuple(row) for row in c.fetchall()}
    finally:
        # Release the connection even when the query fails.
        conn.close()
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# STAGE 1: EXTRACT (WIKIDATA ONLY, NO WIKIPEDIA)
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
def should_include(osm_key, osm_value):
    """Tell whether an (osm_key, osm_value) pair is accepted by INCLUDE_KEYS.

    A key that maps to None accepts every value; a key that maps to a set
    accepts only the values in that set; unknown keys are rejected.
    """
    try:
        accepted_values = INCLUDE_KEYS[osm_key]
    except KeyError:
        return False

    if accepted_values is None:
        return True
    return osm_value in accepted_values
|
||
|
|
|
||
|
|
def extract_from_jsonl():
    """Extract places with extra.wikidata but NO extra.wikipedia from JSONL.

    Streams the zstd-compressed dump at JSONL_PATH line by line and only
    considers lines from the resume point (checkpoint value, or
    COMBINED_START) up to COMBINED_END. A record is inserted into
    ``wiki_places`` with ``source='photon_wave2'`` when it:

    * has country_code "us" or "ca",
    * passes ``should_include`` for its osm_key/osm_value,
    * carries ``extra.wikidata`` and no ``extra.wikipedia``,
    * has an English or default name, and
    * is neither already present in the database nor already seen in this run.

    Every 1,000,000 lines the function checks memory, commits, and writes
    EXTRACT_CHECKPOINT. The checkpoint is deleted after a complete run.
    Lines that are not valid JSON are skipped silently; any other per-line
    error is logged and the line is skipped.
    """
    import io  # function-local: `io` is only needed for the text wrapper below

    conn = get_db()
    try:
        c = conn.cursor()

        # Load existing places from wave 1 to skip
        log.info("Loading existing place names from DB...")
        existing = get_existing_place_names()
        log.info(f"Found {len(existing):,} existing places to skip")

        # Check for checkpoint
        resume_line = COMBINED_START
        checkpoint = read_checkpoint(EXTRACT_CHECKPOINT)
        if checkpoint:
            try:
                resume_line = int(checkpoint[0])
            except ValueError:
                # Restarting from the beginning is safe: already-inserted rows
                # are in `existing` and the insert is INSERT OR IGNORE.
                log.warning(f"Unreadable extract checkpoint {checkpoint[0]!r}; "
                            f"starting from line {COMBINED_START:,}")
            else:
                log.info(f"Resuming extraction from line {resume_line:,}")

        seen = set()
        inserted = 0
        skipped_has_wikipedia = 0
        skipped_no_wikidata = 0
        skipped_filtered = 0
        skipped_no_name = 0
        skipped_dupe = 0
        skipped_existing = 0

        log.info(f"Wave 2 extraction: lines {resume_line:,} to {COMBINED_END:,}")
        log.info("Filtering: wikidata present AND wikipedia absent")

        with open(JSONL_PATH, 'rb') as fh:
            dctx = zstd.ZstdDecompressor()
            with dctx.stream_reader(fh) as reader:
                text_reader = io.TextIOWrapper(reader, encoding='utf-8')

                for line_num, line in enumerate(text_reader, start=1):
                    # The stream cannot seek, so earlier lines are read and dropped.
                    if line_num < resume_line:
                        if line_num % 10_000_000 == 0:
                            log.info(f" Seeking... line {line_num:,}")
                        continue

                    if line_num > COMBINED_END:
                        break

                    # Checkpoint is written before this line is processed, so a
                    # resumed run processes line `line_num` again.
                    if line_num % 1_000_000 == 0:
                        rss = check_memory(f"line {line_num}")
                        log.info(f" Line {line_num:,}, inserted {inserted:,}, RSS {rss:.0f}MB")
                        conn.commit()
                        write_checkpoint(EXTRACT_CHECKPOINT, line_num)

                    try:
                        record = json.loads(line)
                        content = record.get("content", [{}])[0]

                        country_code = content.get("country_code", "")
                        if country_code not in ("us", "ca"):
                            continue

                        osm_key = content.get("osm_key", "")
                        osm_value = content.get("osm_value", "")

                        if not should_include(osm_key, osm_value):
                            skipped_filtered += 1
                            continue

                        extra = content.get("extra", {})

                        # Wave 2: require wikidata, reject if has wikipedia
                        wikidata_id = extra.get("wikidata")
                        if not wikidata_id:
                            skipped_no_wikidata += 1
                            continue

                        wiki_tag = extra.get("wikipedia")
                        if wiki_tag:
                            skipped_has_wikipedia += 1
                            continue

                        # Get name
                        name_obj = content.get("name", {})
                        name = name_obj.get("name:en") or name_obj.get("name")
                        if not name:
                            skipped_no_name += 1
                            continue

                        # Parse address
                        address = content.get("address", {})
                        state = address.get("state") or address.get("state:en")
                        county = address.get("county") or address.get("county:en")

                        # Dedup key; same shape as get_existing_place_names() rows
                        dedup_key = (name, osm_key, osm_value,
                                     county or "", state or "", country_code)

                        # Skip if already in DB from wave 1
                        if dedup_key in existing:
                            skipped_existing += 1
                            continue

                        if dedup_key in seen:
                            skipped_dupe += 1
                            continue
                        seen.add(dedup_key)

                        # Get other fields
                        osm_id = f"{content.get('object_type', '')}{content.get('object_id', '')}"
                        importance = content.get("importance")

                        extra_fields = {k: v for k, v in extra.items()
                                        if k not in ("wikipedia", "wikidata")}
                        extra_json = json.dumps(extra_fields) if extra_fields else None

                        # Insert (no wikipedia_title yet - will resolve via Wikidata)
                        c.execute("""
                            INSERT OR IGNORE INTO wiki_places
                            (place_name, osm_key, osm_value, county, state, country_code,
                             wikidata_id, osm_id, importance, extra_json, source)
                            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'photon_wave2')
                        """, (name, osm_key, osm_value, county, state, country_code,
                              wikidata_id, osm_id, importance, extra_json))

                        if c.rowcount > 0:
                            inserted += 1
                            # Commit only when the counter has just advanced to a
                            # multiple of 10,000, not on every later record.
                            if inserted % 10000 == 0:
                                conn.commit()

                    except json.JSONDecodeError:
                        continue
                    except Exception as e:
                        log.error(f"Error on line {line_num}: {e}")
                        continue

        conn.commit()
    finally:
        # Also runs when check_memory() aborts via sys.exit().
        conn.close()

    if Path(EXTRACT_CHECKPOINT).exists():
        Path(EXTRACT_CHECKPOINT).unlink()
        log.info("Extract checkpoint cleared (completed)")

    log.info(f"Wave 2 extraction complete:")
    log.info(f" Inserted: {inserted:,}")
    log.info(f" Skipped (has wikipedia): {skipped_has_wikipedia:,}")
    log.info(f" Skipped (no wikidata): {skipped_no_wikidata:,}")
    log.info(f" Skipped (filtered type): {skipped_filtered:,}")
    log.info(f" Skipped (no name): {skipped_no_name:,}")
    log.info(f" Skipped (duplicate): {skipped_dupe:,}")
    log.info(f" Skipped (existing wave1): {skipped_existing:,}")
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# STAGE 2: RESOLVE WIKIPEDIA TITLES VIA WIKIDATA
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
def batch_wikidata_lookup(qids):
    """Batch lookup Wikidata Q-IDs to get wiki titles.

    Sends one ``wbgetentities`` request (``props=sitelinks``) for all
    well-formed IDs. IDs that are not of the form ``Q<digits>`` are dropped
    and logged before the request, so one bad value cannot fail the whole
    batch. API-level errors returned in the response body are logged.

    Args:
        qids: Iterable of Wikidata ID strings.

    Returns:
        Dict mapping each entity key in the response to
        ``{"enwiki": title_or_None, "enwikivoyage": title_or_None}``.
        Empty dict when there is nothing to look up or the request fails;
        this function does not raise on network or API errors.
    """
    if not qids:
        return {}

    valid_qids = [qid for qid in qids if _looks_like_qid(qid)]
    if len(valid_qids) != len(qids):
        rejected = [qid for qid in qids if not _looks_like_qid(qid)]
        log.warning(f"Skipping {len(rejected)} malformed Wikidata IDs: {rejected[:5]}")
    if not valid_qids:
        return {}

    params = {
        "action": "wbgetentities",
        "ids": "|".join(valid_qids),
        "props": "sitelinks",
        "format": "json"
    }

    # NOTE(review): no User-Agent header is set here; confirm whether the
    # Wikimedia User-Agent policy requires a descriptive one for this client.
    try:
        resp = requests.get(WIKIDATA_API, params=params, timeout=30)
        resp.raise_for_status()
        data = resp.json()

        # The API can report a failure inside a 200 response body.
        api_error = data.get("error")
        if api_error:
            log.error(f"Wikidata API error: {api_error}")
            return {}

        results = {}
        for qid, entity in data.get("entities", {}).items():
            sitelinks = entity.get("sitelinks", {})
            results[qid] = {
                "enwiki": sitelinks.get("enwiki", {}).get("title"),
                "enwikivoyage": sitelinks.get("enwikivoyage", {}).get("title")
            }
        return results

    except Exception as e:
        log.error(f"Wikidata API error: {e}")
        return {}


def _looks_like_qid(value):
    """Return True when *value* is a string of the form ``Q`` + ASCII digits."""
    return (
        isinstance(value, str)
        and len(value) > 1
        and value[0] == "Q"
        and all(ch in "0123456789" for ch in value[1:])
    )
|
||
|
|
|
||
|
|
def is_travel_type(osm_key, osm_value):
    """Tell whether the (osm_key, osm_value) pair is listed in TRAVEL_TYPES."""
    type_pair = (osm_key, osm_value)
    return type_pair in TRAVEL_TYPES
|
||
|
|
|
||
|
|
def resolve_wikipedia_titles():
    """Resolve Wikipedia/Wikivoyage titles via Wikidata API for wave 2 records.

    Selects wave 2 rows that have a ``wikidata_id`` but no
    ``wikipedia_title``, looks the IDs up in batches of
    WIKIDATA_BATCH_SIZE, and stores the English Wikipedia title (spaces
    replaced by underscores). For rows whose type is a travel type, the
    English Wikivoyage title is stored as well.

    Several rows may share one ``wikidata_id``; every such row in a batch is
    updated. Progress is committed and RESOLVE_CHECKPOINT written every 20
    batches; the checkpoint is removed when the run finishes.
    """
    conn = get_db()
    try:
        c = conn.cursor()

        # Get wave 2 records with wikidata_id but no wikipedia_title
        c.execute("""
            SELECT id, wikidata_id, osm_key, osm_value FROM wiki_places
            WHERE source = 'photon_wave2'
              AND wikidata_id IS NOT NULL
              AND wikipedia_title IS NULL
        """)
        rows = c.fetchall()

        if not rows:
            log.info("No wave 2 records need Wikipedia resolution")
            return

        log.info(f"Resolving Wikipedia titles for {len(rows):,} wave 2 records via Wikidata...")

        resolved_wiki = 0
        resolved_voyage = 0

        for i in range(0, len(rows), WIKIDATA_BATCH_SIZE):
            batch = rows[i:i + WIKIDATA_BATCH_SIZE]

            # Group rows by Q-ID so rows sharing an ID are all updated.
            rows_by_qid = {}
            for row in batch:
                rows_by_qid.setdefault(row[1], []).append(row)

            results = batch_wikidata_lookup(list(rows_by_qid))

            for qid, titles in results.items():
                wiki_title = titles.get("enwiki")
                voyage_title = titles.get("enwikivoyage")
                if wiki_title:
                    wiki_title = wiki_title.replace(' ', '_')
                if voyage_title:
                    voyage_title = voyage_title.replace(' ', '_')

                # .get(): ignore any response key that was not requested.
                for row_id, _row_qid, osm_key, osm_value in rows_by_qid.get(qid, ()):
                    if wiki_title:
                        c.execute("UPDATE wiki_places SET wikipedia_title = ? WHERE id = ?",
                                  (wiki_title, row_id))
                        resolved_wiki += 1

                    # Also set wikivoyage if travel type
                    if voyage_title and is_travel_type(osm_key, osm_value):
                        c.execute("UPDATE wiki_places SET wikivoyage_title = ? WHERE id = ?",
                                  (voyage_title, row_id))
                        resolved_voyage += 1

            if ((i // WIKIDATA_BATCH_SIZE + 1) % 20) == 0:
                log.info(f" Processed {i + len(batch):,}/{len(rows):,} - "
                         f"wiki: {resolved_wiki:,}, voyage: {resolved_voyage:,}")
                conn.commit()
                write_checkpoint(RESOLVE_CHECKPOINT, i + len(batch), resolved_wiki, resolved_voyage)

            time.sleep(0.1)  # Be nice to Wikidata API

        conn.commit()
    finally:
        # Covers the early return above as well as failures mid-run.
        conn.close()

    if Path(RESOLVE_CHECKPOINT).exists():
        Path(RESOLVE_CHECKPOINT).unlink()

    log.info(f"Resolution complete:")
    log.info(f" Wikipedia titles: {resolved_wiki:,}")
    log.info(f" Wikivoyage titles: {resolved_voyage:,}")
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# STAGE 3: VALIDATE
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
def validate_title_worker(args):
    """Worker function for thread pool: check that a title exists in a ZIM.

    Args:
        args: Tuple ``(row_id, title, base_url)``.

    Returns:
        Tuple ``(row_id, exists)``. ``exists`` is True only when a HEAD
        request to ``<base_url>/A/<title>`` (following redirects) answers
        200. Missing titles, the literal placeholder ``NONE``,
        disambiguation titles, titles longer than 100 characters, and
        request failures all yield False.
    """
    row_id, title, base_url = args
    if not title:
        return (row_id, False)

    title = title.replace(" ", "_")

    # Only the exact placeholder is rejected here. A substring test would
    # also discard real titles such as "Nonesuch_River".
    if title.upper() == "NONE" or "(disambiguation" in title.lower() or len(title) > 100:
        return (row_id, False)

    url = f"{base_url}/A/{title}"
    try:
        resp = requests.head(url, allow_redirects=True, timeout=10)
        return (row_id, resp.status_code == 200)
    except Exception:
        # NOTE(review): a transient network failure is reported the same way
        # as a missing article; callers record it as "does not exist".
        return (row_id, False)
|
||
|
|
|
||
|
|
def validate_wikipedia_titles():
    """Validate Wikipedia titles against ZIM for wave 2 records.

    Selects wave 2 rows with a ``wikipedia_title`` whose
    ``wikipedia_exists`` is still NULL, checks each title with
    ``validate_title_worker`` on a pool of VALIDATION_WORKERS threads, and
    stores the result (1/0) together with ``zim_validated_at``. Database
    writes happen on the calling thread; commits occur every 1,000 results
    and once at the end.
    """
    conn = get_db()
    try:
        c = conn.cursor()

        c.execute("""
            SELECT id, wikipedia_title FROM wiki_places
            WHERE source = 'photon_wave2'
              AND wikipedia_title IS NOT NULL
              AND wikipedia_exists IS NULL
        """)
        rows = c.fetchall()

        if not rows:
            log.info("No wave 2 Wikipedia titles to validate")
            return

        log.info(f"Validating {len(rows):,} Wikipedia titles...")

        valid_count = 0
        invalid_count = 0

        work_items = [(row_id, title, WIKIPEDIA_INTERNAL) for row_id, title in rows]

        with ThreadPoolExecutor(max_workers=VALIDATION_WORKERS) as executor:
            futures = [executor.submit(validate_title_worker, item) for item in work_items]

            for i, future in enumerate(as_completed(futures)):
                result = future.result()
                if result is None:
                    # No verdict: leave the row NULL so a later run retries it.
                    continue
                row_id, exists = result

                c.execute("""
                    UPDATE wiki_places
                    SET wikipedia_exists = ?, zim_validated_at = ?
                    WHERE id = ?
                """, (1 if exists else 0, datetime.now().isoformat(), row_id))

                if exists:
                    valid_count += 1
                else:
                    invalid_count += 1

                if (i + 1) % 1000 == 0:
                    log.info(f" Validated {i+1:,}/{len(rows):,} - valid: {valid_count:,}")
                    conn.commit()

        conn.commit()
    finally:
        # Covers the early return above as well as failures mid-run.
        conn.close()

    log.info(f"Wikipedia validation complete: valid={valid_count:,}, invalid={invalid_count:,}")
|
||
|
|
|
||
|
|
def validate_wikivoyage_titles():
    """Validate Wikivoyage titles against ZIM for wave 2 records.

    Selects wave 2 rows with a ``wikivoyage_title`` whose
    ``wikivoyage_exists`` is still NULL, checks each title with
    ``validate_title_worker`` on a pool of VALIDATION_WORKERS threads, and
    stores the result (1/0) together with ``zim_validated_at``. Database
    writes happen on the calling thread; commits occur every 500 results
    and once at the end.
    """
    conn = get_db()
    try:
        c = conn.cursor()

        c.execute("""
            SELECT id, wikivoyage_title FROM wiki_places
            WHERE source = 'photon_wave2'
              AND wikivoyage_title IS NOT NULL
              AND wikivoyage_exists IS NULL
        """)
        rows = c.fetchall()

        if not rows:
            log.info("No wave 2 Wikivoyage titles to validate")
            return

        log.info(f"Validating {len(rows):,} Wikivoyage titles...")

        valid_count = 0
        invalid_count = 0

        work_items = [(row_id, title, WIKIVOYAGE_INTERNAL) for row_id, title in rows]

        with ThreadPoolExecutor(max_workers=VALIDATION_WORKERS) as executor:
            futures = [executor.submit(validate_title_worker, item) for item in work_items]

            for i, future in enumerate(as_completed(futures)):
                result = future.result()
                if result is None:
                    # No verdict: leave the row NULL so a later run retries it.
                    continue
                row_id, exists = result

                c.execute("""
                    UPDATE wiki_places
                    SET wikivoyage_exists = ?, zim_validated_at = ?
                    WHERE id = ?
                """, (1 if exists else 0, datetime.now().isoformat(), row_id))

                if exists:
                    valid_count += 1
                else:
                    invalid_count += 1

                if (i + 1) % 500 == 0:
                    conn.commit()

        conn.commit()
    finally:
        # Covers the early return above as well as failures mid-run.
        conn.close()

    log.info(f"Wikivoyage validation complete: valid={valid_count:,}, invalid={invalid_count:,}")
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# STAGE 4: SUMMARY GENERATION
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
def fetch_article_content(title, base_url, max_chars=8000):
    """Fetch and extract text content from ZIM article.

    Args:
        title: Article title; spaces are replaced by underscores.
        base_url: ZIM endpoint; the article is read from ``<base_url>/A/<title>``.
        max_chars: Maximum number of text characters kept; longer text is
            cut at this length and ``"..."`` is appended.

    Returns:
        The article's visible text with script, style, nav, header, footer
        and aside elements removed, or None when the title is missing, is
        the literal placeholder ``NONE``, is a disambiguation title, is
        longer than 100 characters, the response is not 200, or the request
        fails.
    """
    if not title:
        return None

    title = title.replace(" ", "_")

    # Only the exact placeholder is rejected here. A substring test would
    # also discard real titles such as "Nonesuch_River".
    if title.upper() == "NONE" or "(disambiguation" in title.lower() or len(title) > 100:
        return None

    url = f"{base_url}/A/{title}"
    try:
        resp = requests.get(url, timeout=30)
        if resp.status_code != 200:
            return None

        soup = BeautifulSoup(resp.text, 'html.parser')

        # Drop non-content elements before extracting text.
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()

        text = soup.get_text(separator=' ', strip=True)

        if len(text) > max_chars:
            text = text[:max_chars] + "..."

        return text

    except Exception as e:
        log.error(f"Error fetching {title}: {e}")
        return None
|
||
|
|
|
||
|
|
def build_summary_prompt(place, wiki_content=None, voyage_content=None):
    """Build summary prompt.

    Args:
        place: Mapping with keys ``place_name``, ``osm_key``, ``osm_value``,
            ``county``, ``state``, ``country_code``, and, when the matching
            content is given, ``wikipedia_title`` / ``wikivoyage_title``.
        wiki_content: Wikipedia article text, or None.
        voyage_content: Wikivoyage article text, or None.

    Returns:
        The prompt as a single newline-joined string. For travel types with
        no Wikivoyage content, a one-line request for the Wikivoyage title
        is placed as its own paragraph between the instructions and the
        response-format section.
    """
    prompt_parts = [
        "Generate a 3-4 sentence summary for a map app user who tapped on this location.",
        "",
        f"Place: {place['place_name']} ({place['osm_key']}:{place['osm_value']})",
        f"Location: {place['county'] or 'N/A'}, {place['state'] or 'N/A'}, {place['country_code'].upper()}",
        ""
    ]

    if wiki_content:
        prompt_parts.extend([
            f"=== WIKIPEDIA ARTICLE: {place['wikipedia_title']} ===",
            wiki_content,
            ""
        ])

    if voyage_content:
        prompt_parts.extend([
            f"=== WIKIVOYAGE ARTICLE: {place['wikivoyage_title']} ===",
            voyage_content,
            ""
        ])

    prompt_parts.extend([
        "Instructions:",
        "- If either article appears to be about a DIFFERENT place, ignore it and provide",
        " the correct title if you know it.",
        "",
        "- Write based on place type:",
        " * Settlements: what's notable, regional context, key attractions",
        " * Natural features: terrain, activities, access, best season",
        " * Parks/reserves: what you'll see, trails, camping, logistics",
        " * Historic sites: significance, what remains, visiting info",
        "",
        "- Engaging but informative tone.",
        "",
    ])

    # Appended in reading order (hint, then blank line) so the hint is
    # separated from the response-format header by exactly one blank line.
    if is_travel_type(place['osm_key'], place['osm_value']) and not voyage_content:
        prompt_parts.extend([
            "If this place has a Wikivoyage article, include the title.",
            "",
        ])

    prompt_parts.extend([
        "Response format (REQUIRED):",
        "WIKIPEDIA_TITLE: <verified or corrected title, or NONE>",
        "WIKIVOYAGE_TITLE: <verified or corrected title, or NONE>",
        "SUMMARY: <your 3-4 sentence summary>",
        "POPULATION: <number if mentioned, or NONE>",
    ])

    return "\n".join(prompt_parts)
|
||
|
|
|
||
|
|
def parse_gemini_response(text):
    """Parse Gemini response.

    Recognizes lines starting with ``WIKIPEDIA_TITLE:``,
    ``WIKIVOYAGE_TITLE:``, ``SUMMARY:`` and ``POPULATION:``. The summary may
    continue over following non-empty lines until the next recognized
    field; its lines are joined with single spaces.

    A field value is reported as None when it is empty or is the ``NONE``
    placeholder (any case, optionally followed by non-alphanumeric text such
    as ``"None (not mentioned)"``). Values that merely begin with those
    letters, e.g. ``"Nonesuch River"``, are kept.

    Args:
        text: Raw model output; empty or None yields all-None fields.

    Returns:
        Dict with keys ``wikipedia_title``, ``wikivoyage_title``,
        ``summary`` and ``population``; each is a string or None.
    """
    result = {
        "wikipedia_title": None,
        "wikivoyage_title": None,
        "summary": None,
        "population": None
    }
    if not text:
        return result

    summary_lines = []
    in_summary = False

    for line in text.strip().split('\n'):
        line_stripped = line.strip()

        if line_stripped.startswith("WIKIPEDIA_TITLE:"):
            result["wikipedia_title"] = _gemini_field_value(line_stripped.split(":", 1)[1])
            in_summary = False
        elif line_stripped.startswith("WIKIVOYAGE_TITLE:"):
            result["wikivoyage_title"] = _gemini_field_value(line_stripped.split(":", 1)[1])
            in_summary = False
        elif line_stripped.startswith("SUMMARY:"):
            first_part = line_stripped.split(":", 1)[1].strip()
            if first_part:
                summary_lines.append(first_part)
            in_summary = True
        elif line_stripped.startswith("POPULATION:"):
            in_summary = False
            result["population"] = _gemini_field_value(line_stripped.split(":", 1)[1])
        elif in_summary and line_stripped:
            # Continuation of a multi-line summary.
            summary_lines.append(line_stripped)

    if summary_lines:
        result["summary"] = " ".join(summary_lines)

    return result


def _gemini_field_value(raw):
    """Return the stripped field value, or None for empty / NONE placeholders."""
    value = raw.strip()
    if not value:
        return None
    upper = value.upper()
    # "NONE" alone or followed by punctuation/space is the placeholder;
    # "NONESUCH..." continues with a letter and is a real value.
    if upper.startswith("NONE") and (len(upper) == 4 or not upper[4].isalnum()):
        return None
    return value
|
||
|
|
|
||
|
|
def log_gemini_response(place_id, prompt, response, parsed, output_tokens=None):
    """Append one JSON line describing a Gemini exchange to GEMINI_LOG.

    The record holds a timestamp, the place id, the prompt length (not the
    prompt itself), the output token count, the raw response, and the
    parsed result. The log's parent directory is created if missing.
    """
    Path(GEMINI_LOG).parent.mkdir(parents=True, exist_ok=True)

    with open(GEMINI_LOG, 'a') as sink:
        entry = {
            "timestamp": datetime.now().isoformat(),
            "place_id": place_id,
            "prompt_length": len(prompt),
            "output_tokens": output_tokens,
            "response": response,
            "parsed": parsed
        }
        serialized = json.dumps(entry)
        sink.write(serialized + "\n")
|
||
|
|
|
||
|
|
def is_rate_limit_error(error):
    """Classify an error as rate limiting from its lower-cased text.

    True when the text contains "429", "resource_exhausted" or "quota", or
    contains both "rate" and "limit".
    """
    message = str(error).lower()

    for marker in ("429", "resource_exhausted", "quota"):
        if marker in message:
            return True

    return "rate" in message and "limit" in message
|
||
|
|
|
||
|
|
def summarize_worker(args):
    """Produce a Gemini summary for one place (thread-pool worker).

    Args:
        args: Tuple ``(place, client, circuit_breaker)``. ``place`` is a
            mapping describing the row; ``client`` exposes
            ``models.generate_content``; ``circuit_breaker`` is a shared
            mapping with keys ``"lock"``, ``"abort"`` and
            ``"consecutive_429"``.

    Returns:
        On success, a dict with ``place_id``, ``success=True``, ``summary``,
        ``summary_source``, ``population``, ``wikipedia_title`` and
        ``wikivoyage_title``. On failure, a dict with ``place_id``,
        ``success=False``, ``error`` and ``error_message`` (plus
        ``is_rate_limit`` when the final attempt raised).
    """
    place, client, circuit_breaker = args

    def failed(code, message):
        # Shared shape of every failure result.
        return {
            "place_id": place["id"],
            "success": False,
            "error": code,
            "error_message": message
        }

    # Fetch only the articles that validation marked as existing.
    wiki_content = (
        fetch_article_content(place["wikipedia_title"], WIKIPEDIA_INTERNAL)
        if place["wikipedia_exists"] else None
    )
    voyage_content = (
        fetch_article_content(place["wikivoyage_title"], WIKIVOYAGE_INTERNAL)
        if place["wikivoyage_exists"] else None
    )

    if not (wiki_content or voyage_content):
        return failed("no_content", f"No content for place {place['id']}")

    prompt = build_summary_prompt(place, wiki_content, voyage_content)

    # At least one source is present here, so the label is never empty.
    available = (("wikipedia", wiki_content), ("wikivoyage", voyage_content))
    summary_source = "+".join(label for label, content in available if content)

    response_text = None
    output_tokens = None

    for attempt in range(MAX_RETRIES):
        with circuit_breaker["lock"]:
            if circuit_breaker["abort"]:
                return failed("circuit_breaker_abort", "Circuit breaker aborted")

        can_retry = attempt < MAX_RETRIES - 1

        try:
            generation_config = types.GenerateContentConfig(
                temperature=0.3,
                max_output_tokens=3000
            )
            response = client.models.generate_content(
                model=GEMINI_MODEL,
                contents=prompt,
                config=generation_config
            )
            response_text = response.text

            if hasattr(response, 'usage_metadata') and response.usage_metadata:
                output_tokens = getattr(response.usage_metadata, 'candidates_token_count', None)

            # A successful call ends any run of consecutive rate-limit errors.
            with circuit_breaker["lock"]:
                circuit_breaker["consecutive_429"] = 0
            break

        except Exception as e:
            rate_limited = is_rate_limit_error(e)

            if rate_limited:
                with circuit_breaker["lock"]:
                    circuit_breaker["consecutive_429"] += 1
                    consecutive = circuit_breaker["consecutive_429"]
                log.warning(f"Rate limit ({consecutive} consecutive) for {place['id']}: {e}")
            else:
                with circuit_breaker["lock"]:
                    circuit_breaker["consecutive_429"] = 0

            if not can_retry:
                outcome = failed(type(e).__name__, str(e))
                outcome["is_rate_limit"] = rate_limited
                return outcome

            if not rate_limited:
                log.warning(f"Gemini retry {attempt+1} for {place['id']}: {e}")
            time.sleep(RETRY_DELAYS[attempt])

    if not response_text:
        return failed("no_response", "No response from Gemini")

    parsed = parse_gemini_response(response_text)
    log_gemini_response(place['id'], prompt, response_text, parsed, output_tokens)

    if not parsed["summary"]:
        return failed("parse_failed", "No summary parsed")

    return {
        "place_id": place["id"],
        "success": True,
        "summary": parsed["summary"],
        "summary_source": summary_source,
        "population": parsed["population"],
        "wikipedia_title": parsed["wikipedia_title"],
        "wikivoyage_title": parsed["wikivoyage_title"]
    }
|
||
|
|
|
||
|
|
def _load_gemini_api_key():
    """Return the Gemini API key from the environment or a sibling .env file.

    The ``GEMINI_API_KEY`` environment variable takes precedence. Otherwise the
    first ``GEMINI_API_KEY=`` line of a ``.env`` file next to this script is
    used. Returns a falsy value (``None`` or ``""``) when no key is available.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if api_key:
        return api_key

    env_path = Path(__file__).parent / ".env"
    if env_path.exists():
        for line in env_path.read_text().splitlines():
            if line.startswith("GEMINI_API_KEY="):
                # Drop surrounding whitespace and optional quote characters.
                return line.split("=", 1)[1].strip().strip('"\'')
    return None


def generate_summaries(dry_run=False, workers=None):
    """Generate summaries for wave 2 validated places.

    Selects ``photon_wave2`` rows that have a validated Wikipedia or Wikivoyage
    article but no summary yet, fans them out to ``summarize_worker`` on a
    thread pool, and writes each result back to ``wiki_places`` (or records a
    row in ``wiki_failures``).

    Args:
        dry_run: When true, only the first 5 selected places are processed.
        workers: Thread pool size; defaults to ``SUMMARIZE_WORKERS`` when None.

    Returns:
        None. Logs an error and returns early when no API key is configured.
    """
    if workers is None:
        workers = SUMMARIZE_WORKERS

    api_key = _load_gemini_api_key()
    if not api_key:
        log.error("GEMINI_API_KEY not set")
        return

    client = genai.Client(api_key=api_key)

    conn = get_db()
    # try/finally so the connection is released on every exit path, including
    # the early "nothing to do" return and unexpected exceptions.
    try:
        c = conn.cursor()

        c.execute("""
            SELECT id, place_name, osm_key, osm_value, county, state, country_code,
                   wikipedia_title, wikivoyage_title, wikipedia_exists, wikivoyage_exists,
                   wikidata_id
            FROM wiki_places
            WHERE source = 'photon_wave2'
              AND (wikipedia_exists = 1 OR wikivoyage_exists = 1)
              AND summary IS NULL
            ORDER BY id
        """)
        rows = c.fetchall()

        if not rows:
            log.info("No wave 2 places need summaries")
            return

        if dry_run:
            rows = rows[:5]
            log.info(f"DRY RUN: Processing only {len(rows)} places with {workers} workers")
        else:
            log.info(f"Generating summaries for {len(rows):,} wave 2 places with {workers} workers...")

        # Keys in the same order as the SELECT column list above.
        columns = (
            "id", "place_name", "osm_key", "osm_value", "county", "state",
            "country_code", "wikipedia_title", "wikivoyage_title",
            "wikipedia_exists", "wikivoyage_exists", "wikidata_id",
        )
        places = [dict(zip(columns, row)) for row in rows]

        # State shared with the workers; every access goes through "lock".
        circuit_breaker = {
            "lock": threading.Lock(),
            "consecutive_429": 0,
            "abort": False
        }

        processed = 0
        success = 0
        errors = 0
        last_place_id = 0

        with ThreadPoolExecutor(max_workers=workers) as executor:
            futures = {
                executor.submit(summarize_worker, (place, client, circuit_breaker)): place
                for place in places
            }

            for future in as_completed(futures):
                result = future.result()
                processed += 1
                last_place_id = result["place_id"]

                if processed % 500 == 0:
                    rss = check_memory(f"summary {processed}")
                    log.info(f" Memory: RSS {rss:.0f}MB")

                if result["success"]:
                    now = datetime.now().isoformat()
                    # COALESCE keeps the stored title when the result carries
                    # no replacement title (NULL).
                    c.execute("""
                        UPDATE wiki_places SET
                            summary = ?,
                            summary_source = ?,
                            wiki_population = ?,
                            wikipedia_title = COALESCE(?, wikipedia_title),
                            wikivoyage_title = COALESCE(?, wikivoyage_title),
                            summary_generated_at = ?,
                            updated_at = ?
                        WHERE id = ?
                    """, (
                        result["summary"],
                        result["summary_source"],
                        result["population"],
                        result["wikipedia_title"],
                        result["wikivoyage_title"],
                        now,
                        now,
                        result["place_id"]
                    ))
                    success += 1
                else:
                    if result["error"] != "circuit_breaker_abort":
                        log.warning(f"Failed {result['place_id']}: {result['error_message']}")
                        # "no_content" and "parse_failed" are logged only, not
                        # recorded in wiki_failures.
                        if result["error"] not in ("no_content", "parse_failed"):
                            c.execute("""
                                INSERT INTO wiki_failures (place_id, wave, stage, error_type, error_message)
                                VALUES (?, 2, 'summarize', ?, ?)
                            """, (result["place_id"], result["error"], result["error_message"]))
                    errors += 1

                # Circuit breaker check
                if result.get("is_rate_limit"):
                    with circuit_breaker["lock"]:
                        if circuit_breaker["consecutive_429"] >= CIRCUIT_BREAKER_THRESHOLD:
                            log.warning(f"CIRCUIT BREAKER: Pausing {CIRCUIT_BREAKER_PAUSE//60} minutes...")
                            circuit_breaker["consecutive_429"] = 0
                            # NOTE(review): the sleep happens while the lock is
                            # held, so workers block when they next touch the
                            # shared counter. That looks like what makes the
                            # pause effective -- confirm before moving it.
                            time.sleep(CIRCUIT_BREAKER_PAUSE)
                            log.info("CIRCUIT BREAKER: Resuming...")

                if processed % 50 == 0:
                    conn.commit()
                    write_checkpoint(SUMMARIZE_CHECKPOINT, last_place_id, success, errors)

                if processed % 100 == 0:
                    log.info(f" Processed {processed:,}/{len(places):,} - success: {success:,}, errors: {errors:,}")

                with circuit_breaker["lock"]:
                    if circuit_breaker["abort"]:
                        log.error("Aborting due to circuit breaker")
                        break

        conn.commit()
    finally:
        conn.close()

    if Path(SUMMARIZE_CHECKPOINT).exists():
        Path(SUMMARIZE_CHECKPOINT).unlink()

    log.info(f"Wave 2 summary generation complete:")
    log.info(f" Success: {success:,}")
    log.info(f" Errors: {errors:,}")
|
||
|
|
|
||
|
|
# =============================================================================
# STAGE 5: RE-VALIDATE CORRECTED TITLES
# =============================================================================
|
||
|
|
|
||
|
|
def revalidate_corrected_titles():
    """Re-validate titles corrected by Gemini.

    Selects ``photon_wave2`` rows whose summary was generated after their last
    ZIM validation (or that were never validated), re-checks each Wikipedia and
    Wikivoyage title with ``validate_title_worker``, and stores the refreshed
    ``*_exists`` flag together with a new ``zim_validated_at`` timestamp.

    Returns:
        None.
    """
    commit_every = 500  # validations between intermediate commits/progress logs

    conn = get_db()
    # try/finally so the connection is released on the early return and on errors.
    try:
        c = conn.cursor()

        c.execute("""
            SELECT id, wikipedia_title, wikivoyage_title FROM wiki_places
            WHERE source = 'photon_wave2'
              AND summary_generated_at IS NOT NULL
              AND (zim_validated_at IS NULL OR zim_validated_at < summary_generated_at)
        """)
        rows = c.fetchall()

        if not rows:
            log.info("No wave 2 corrected titles need re-validation")
            return

        log.info(f"Re-validating {len(rows):,} wave 2 corrected titles...")

        wiki_revalidated = 0
        voyage_revalidated = 0
        last_committed_total = 0

        for row_id, wiki_title, voyage_title in rows:
            now = datetime.now().isoformat()

            if wiki_title:
                result = validate_title_worker((row_id, wiki_title, WIKIPEDIA_INTERNAL))
                # A falsy result leaves the row untouched.
                if result:
                    _, exists = result
                    c.execute("""
                        UPDATE wiki_places SET wikipedia_exists = ?, zim_validated_at = ?
                        WHERE id = ?
                    """, (1 if exists else 0, now, row_id))
                    wiki_revalidated += 1

            if voyage_title:
                result = validate_title_worker((row_id, voyage_title, WIKIVOYAGE_INTERNAL))
                if result:
                    _, exists = result
                    c.execute("""
                        UPDATE wiki_places SET wikivoyage_exists = ?, zim_validated_at = ?
                        WHERE id = ?
                    """, (1 if exists else 0, now, row_id))
                    voyage_revalidated += 1

            # The total can advance by 0, 1 or 2 per row, so compare against the
            # last committed total rather than testing for an exact multiple.
            total = wiki_revalidated + voyage_revalidated
            if total - last_committed_total >= commit_every:
                conn.commit()
                log.info(f" Re-validated {total:,}")
                last_committed_total = total

        conn.commit()
    finally:
        conn.close()

    log.info(f"Re-validation complete: {wiki_revalidated:,} Wikipedia, {voyage_revalidated:,} Wikivoyage")
|
||
|
|
|
||
|
|
# =============================================================================
# MAIN
# =============================================================================
|
||
|
|
|
||
|
|
def _parse_workers(argv):
    """Return the worker count from the last ``--workers=N`` argument, or None.

    Raises:
        ValueError: If the value is not an integer or is smaller than 1.
    """
    workers = None
    for arg in argv:
        if arg.startswith("--workers="):
            workers = int(arg.split("=", 1)[1])
            if workers < 1:
                raise ValueError(f"worker count must be >= 1, got {workers}")
    return workers


def main():
    """Command-line entry point: dispatch ``sys.argv[1]`` to a pipeline stage.

    Recognized commands are ``extract``, ``resolve``, ``validate``,
    ``summarize`` (with optional ``--dry-run`` and ``--workers=N``),
    ``revalidate`` and ``all``. Prints the module docstring and exits with
    status 1 when the command is missing or unknown, and exits with status 1
    when ``--workers`` is not a positive integer.
    """
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)

    command = sys.argv[1].lower()

    if command == "extract":
        log.info("=== WAVE 2 STAGE 1: EXTRACT ===")
        extract_from_jsonl()

    elif command == "resolve":
        log.info("=== WAVE 2 STAGE 2: RESOLVE ===")
        resolve_wikipedia_titles()

    elif command == "validate":
        log.info("=== WAVE 2 STAGE 3: VALIDATE ===")
        validate_wikipedia_titles()
        validate_wikivoyage_titles()

    elif command == "summarize":
        dry_run = "--dry-run" in sys.argv
        # Reject a bad --workers value up front instead of crashing with a
        # traceback here or later inside the thread pool.
        try:
            workers = _parse_workers(sys.argv)
        except ValueError as exc:
            print(f"Invalid --workers value: {exc}")
            sys.exit(1)
        if dry_run:
            log.info("=== WAVE 2 STAGE 4: SUMMARIZE (DRY RUN) ===")
        else:
            log.info("=== WAVE 2 STAGE 4: SUMMARIZE ===")
        generate_summaries(dry_run=dry_run, workers=workers)

    elif command == "revalidate":
        log.info("=== WAVE 2 STAGE 5: RE-VALIDATE ===")
        revalidate_corrected_titles()

    elif command == "all":
        log.info("=== WAVE 2: ALL STAGES ===")
        log.info("=== STAGE 1: EXTRACT ===")
        extract_from_jsonl()
        log.info("=== STAGE 2: RESOLVE ===")
        resolve_wikipedia_titles()
        log.info("=== STAGE 3: VALIDATE ===")
        validate_wikipedia_titles()
        validate_wikivoyage_titles()
        log.info("=== STAGE 4: SUMMARIZE ===")
        generate_summaries()
        log.info("=== STAGE 5: RE-VALIDATE ===")
        revalidate_corrected_titles()

    else:
        print(f"Unknown command: {command}")
        print(__doc__)
        sys.exit(1)
|
||
|
|
|
||
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|