mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-06-10 08:54:34 +02:00
Compare commits
104 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
e840a119dd |
|||
|
6365fe6756 |
|||
|
ac99723e51 |
|||
|
21c0f11eff |
|||
|
879df84b7a |
|||
|
aa6e972260 |
|||
|
79d7b2b343 |
|||
|
adee6d5a69 |
|||
|
86c902f7b5 |
|||
|
1f05d4b4d6 |
|||
|
d7292c4cc7 |
|||
|
d56b1d5f92 |
|||
|
c968497b94 |
|||
|
ed36eec85e |
|||
|
14ad2cd34a |
|||
|
f42b1fef3b |
|||
|
cf74f1840b |
|||
|
25cf5ac16a |
|||
| bb220b7ba3 | |||
| 75664c7d02 | |||
|
f7a501b4d7 |
|||
| dcd4ddd358 | |||
| f67f4ec9e3 | |||
|
dc7591b101 |
|||
| 484dfbd1e0 | |||
|
573347a2ee |
|||
| 3d2d69cd56 | |||
|
a80bb6e848 |
|||
| f276b95753 | |||
| c1ba1f8dc7 | |||
| a04c10ad55 | |||
| d8f84ab55a | |||
| b4e33eb048 | |||
| 05c24f95f6 | |||
| 686b35710a | |||
| cf758476b4 | |||
| 87a4741b8d | |||
| 58347415bc | |||
| ff0721c23e | |||
| 2252905986 | |||
| bc463188d5 | |||
| 1a9dfc8f8d | |||
| 3293cb4238 | |||
| e0eedcedfd | |||
| 26d4bc7478 | |||
| f2a0f81580 | |||
| 227affca9d | |||
| fa456fecb1 | |||
| 83a21854c3 | |||
| b741e217f6 | |||
| 991826b4f1 | |||
| 121eb45b44 | |||
| b5de9c6e39 | |||
| 2387a96a1e | |||
| e9c9cee4f3 | |||
| 07e6d0460b | |||
| 4f96d8f6fe | |||
| 2ed9335f4e | |||
| f35af18320 | |||
| 63b68bfea7 | |||
| 15c58a69ac | |||
| 829bc87b7b | |||
| 9c5b0520f9 | |||
| 3280e34718 | |||
| a4288c0cd8 | |||
| 095bf8c2af | |||
| 620f99c762 | |||
| d460f0e202 | |||
| 65693d15aa | |||
| 2121ee4936 | |||
| 64605b38bb | |||
| e6b81db520 | |||
| d4c5c371ca | |||
| ac69e2761d | |||
| 87b230dcba | |||
| c76d63b785 | |||
| a14501347b | |||
| dfab388769 | |||
| 23483e8198 | |||
| 3243f2f252 | |||
| 9841c38011 | |||
| a9510b5ed9 | |||
| c5283ece3e | |||
| 5f5bcedab9 | |||
| 9692044790 | |||
| b035ba3f20 | |||
| 76076fc4ab | |||
| 8945c82e3f | |||
| f0b160ef7c | |||
| 45c3bb8d56 | |||
| 1ce9a3731f | |||
| 45b954fccc | |||
| 125602fa13 | |||
| da50e5f0b8 | |||
| 491a4350fc | |||
| b250d0c257 | |||
| a40ce47127 | |||
| fed02186fa | |||
| 6f2a1d206e | |||
| 501004ecf1 | |||
| 2635160887 | |||
| c60aa5e80d | |||
| 7c1af0f063 | |||
| 8d54ff165d |
31 changed files with 3309 additions and 12 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -24,3 +24,7 @@ recon.db
|
||||||
|
|
||||||
# OS
|
# OS
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
|
||||||
|
# Kiwix binary tools (installed from tarball)
|
||||||
|
bin/
|
||||||
|
status.db
|
||||||
|
|
|
||||||
12
config.yaml
12
config.yaml
|
|
@ -413,6 +413,14 @@ peertube:
|
||||||
rate_limit_delay: 0.5 # Delay between video ingestions (seconds)
|
rate_limit_delay: 0.5 # Delay between video ingestions (seconds)
|
||||||
poll_interval: 1800 # Seconds between PeerTube acquisition polls (30 min)
|
poll_interval: 1800 # Seconds between PeerTube acquisition polls (30 min)
|
||||||
|
|
||||||
|
scraper:
|
||||||
|
workspace: /opt/recon/data/scraper # Working directory (tmp dirs for Zimit output)
|
||||||
|
output_dir: /mnt/kiwix # Finished .zim files land here (kiwix-serve library)
|
||||||
|
default_language: eng # ISO 639-3 language code for ZIM metadata
|
||||||
|
poll_interval: 300 # Seconds between checking for pending scrape jobs
|
||||||
|
docker_image: ghcr.io/openzim/zimit # Zimit Docker image for web crawling
|
||||||
|
docker_workers: 2 # Concurrent crawl workers inside Zimit container
|
||||||
|
|
||||||
# Stream B: New Library Pipeline
|
# Stream B: New Library Pipeline
|
||||||
new_pipeline:
|
new_pipeline:
|
||||||
# Disabled 2026-04-14 for refactor — see refactored-recon repo for context
|
# Disabled 2026-04-14 for refactor — see refactored-recon repo for context
|
||||||
|
|
@ -440,3 +448,7 @@ pipeline:
|
||||||
text: text_processor
|
text: text_processor
|
||||||
# mtime stability threshold for picking up files from acquired/
|
# mtime stability threshold for picking up files from acquired/
|
||||||
mtime_stability_seconds: 10
|
mtime_stability_seconds: 10
|
||||||
|
# Language filter: skip non-English content before Gemini enrichment
|
||||||
|
language_filter: true # Enable langdetect-based filtering
|
||||||
|
allowed_languages: # ISO 639-1 codes allowed through enrichment
|
||||||
|
- en
|
||||||
|
|
|
||||||
18
config/address_book.yaml
Normal file
18
config/address_book.yaml
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
# RECON Address Book — saved locations for navigation shortcuts.
|
||||||
|
# Entries are matched by name and aliases (case-insensitive).
|
||||||
|
# Add new entries by appending to the list below.
|
||||||
|
|
||||||
|
entries:
|
||||||
|
- id: home
|
||||||
|
name: Home
|
||||||
|
aliases:
|
||||||
|
- home
|
||||||
|
- matt's house
|
||||||
|
- 214 north st
|
||||||
|
- 214 north street
|
||||||
|
address: "214 North St, Filer, ID 83328"
|
||||||
|
lat: 42.5735833
|
||||||
|
lon: -114.6066389
|
||||||
|
tags:
|
||||||
|
- residence
|
||||||
|
- primary
|
||||||
67
config/profiles/home.yaml
Normal file
67
config/profiles/home.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
||||||
|
# Deployment profile: Home (VM 1130)
|
||||||
|
# Active on the main Echo6 deployment. Full stack with planet-scale NA tiles.
|
||||||
|
# Override via RECON_PROFILE env var in /etc/systemd/system/recon.service
|
||||||
|
|
||||||
|
profile: home
|
||||||
|
region_name: "North America"
|
||||||
|
|
||||||
|
tileset:
|
||||||
|
url: "/tiles/planet/current.pmtiles"
|
||||||
|
bounds: [-168, 14, -52, 72]
|
||||||
|
max_zoom: 15
|
||||||
|
attribution: "Protomaps © OSM"
|
||||||
|
|
||||||
|
tileset_hillshade:
|
||||||
|
url: "/tiles/planet-dem.pmtiles"
|
||||||
|
encoding: "terrarium"
|
||||||
|
max_zoom: 12
|
||||||
|
|
||||||
|
traffic:
|
||||||
|
provider: "tomtom"
|
||||||
|
proxy_url: "/api/traffic/flow/{z}/{x}/{y}.png"
|
||||||
|
|
||||||
|
place_details:
|
||||||
|
local_source: "nominatim"
|
||||||
|
local_bbox: [-125.0, 31.3, -104.0, 49.0]
|
||||||
|
fallback_source: "overpass"
|
||||||
|
|
||||||
|
services:
|
||||||
|
geocode: "/api/geocode"
|
||||||
|
reverse: "/api/reverse"
|
||||||
|
address_book: "/api/address_book"
|
||||||
|
valhalla: "/valhalla"
|
||||||
|
|
||||||
|
auth:
|
||||||
|
login_url: "/outpost.goauthentik.io/start?rd=%2F"
|
||||||
|
logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/"
|
||||||
|
|
||||||
|
features:
|
||||||
|
has_nominatim_details: true
|
||||||
|
has_kiwix_wiki: true
|
||||||
|
has_hillshade: true
|
||||||
|
has_3d_terrain: false
|
||||||
|
has_traffic_overlay: true
|
||||||
|
has_landclass: true
|
||||||
|
has_public_lands_layer: true
|
||||||
|
has_contours: true
|
||||||
|
has_contours_test: false
|
||||||
|
has_contours_test_10ft: false
|
||||||
|
has_address_book_write: false
|
||||||
|
has_overture_enrichment: true
|
||||||
|
has_google_places_enrichment: true
|
||||||
|
has_contacts: true
|
||||||
|
has_wiki_rewriting: true
|
||||||
|
has_wiki_discovery: false
|
||||||
|
has_usfs_trails: true
|
||||||
|
has_blm_trails: true
|
||||||
|
|
||||||
|
defaults:
|
||||||
|
center: [42.5736, -114.6066]
|
||||||
|
zoom: 10
|
||||||
|
|
||||||
|
# Offroute wilderness routing
|
||||||
|
offroute:
|
||||||
|
osm_pbf_path: "/mnt/nav/sources/idaho-latest.osm.pbf"
|
||||||
|
densify_interval_m: 100
|
||||||
|
postgis_dsn: "dbname=padus"
|
||||||
|
|
||||||
51
config/profiles/minimal_pi.yaml
Normal file
51
config/profiles/minimal_pi.yaml
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
# Deployment profile: Minimal Pi (single-state pocket deployment)
|
||||||
|
# Template for the lightest possible field kit — Idaho only.
|
||||||
|
# Override via RECON_PROFILE env var.
|
||||||
|
|
||||||
|
profile: minimal_pi
|
||||||
|
region_name: "Idaho"
|
||||||
|
|
||||||
|
tileset:
|
||||||
|
url: "/tiles/idaho.pmtiles"
|
||||||
|
bounds: [-117.5, 42.0, -111.0, 49.0]
|
||||||
|
max_zoom: 15
|
||||||
|
attribution: "Protomaps © OSM"
|
||||||
|
|
||||||
|
tileset_hillshade:
|
||||||
|
url: "/tiles/hillshade-idaho.pmtiles"
|
||||||
|
encoding: "terrarium"
|
||||||
|
max_zoom: 12
|
||||||
|
|
||||||
|
traffic:
|
||||||
|
provider: "tomtom"
|
||||||
|
proxy_url: "/api/traffic/flow/{z}/{x}/{y}.png"
|
||||||
|
|
||||||
|
services:
|
||||||
|
geocode: "/api/geocode"
|
||||||
|
reverse: "/api/reverse"
|
||||||
|
address_book: "/api/address_book"
|
||||||
|
valhalla: "/valhalla"
|
||||||
|
|
||||||
|
# TODO(matt): confirm logout next= host for this profile
|
||||||
|
auth:
|
||||||
|
login_url: "/outpost.goauthentik.io/start?rd=%2F"
|
||||||
|
logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/"
|
||||||
|
|
||||||
|
features:
|
||||||
|
has_nominatim_details: false
|
||||||
|
has_kiwix_wiki: false
|
||||||
|
has_hillshade: false
|
||||||
|
has_3d_terrain: false
|
||||||
|
has_traffic_overlay: false
|
||||||
|
has_landclass: false
|
||||||
|
has_public_lands_layer: false
|
||||||
|
has_address_book_write: true
|
||||||
|
has_overture_enrichment: false
|
||||||
|
has_google_places_enrichment: false
|
||||||
|
has_contacts: false
|
||||||
|
has_wiki_rewriting: false
|
||||||
|
has_wiki_discovery: false
|
||||||
|
|
||||||
|
defaults:
|
||||||
|
center: [44.0, -114.0]
|
||||||
|
zoom: 7
|
||||||
59
config/profiles/regional_pi.yaml
Normal file
59
config/profiles/regional_pi.yaml
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
# Deployment profile: Regional Pi (multi-state field kit)
|
||||||
|
# Template for a Raspberry Pi covering Idaho + surrounding states.
|
||||||
|
# Override via RECON_PROFILE env var.
|
||||||
|
|
||||||
|
profile: regional_pi
|
||||||
|
region_name: "Idaho + Neighbors"
|
||||||
|
|
||||||
|
tileset:
|
||||||
|
url: "/tiles/regional.pmtiles"
|
||||||
|
bounds: [-125, 40, -104, 49]
|
||||||
|
max_zoom: 15
|
||||||
|
attribution: "Protomaps © OSM"
|
||||||
|
|
||||||
|
tileset_hillshade:
|
||||||
|
url: "/tiles/hillshade-regional.pmtiles"
|
||||||
|
encoding: "terrarium"
|
||||||
|
max_zoom: 12
|
||||||
|
|
||||||
|
traffic:
|
||||||
|
provider: "tomtom"
|
||||||
|
proxy_url: "/api/traffic/flow/{z}/{x}/{y}.png"
|
||||||
|
|
||||||
|
place_details:
|
||||||
|
local_source: "nominatim"
|
||||||
|
local_bbox: [-125.0, 40.0, -104.0, 49.0]
|
||||||
|
fallback_source: "overpass"
|
||||||
|
|
||||||
|
services:
|
||||||
|
geocode: "/api/geocode"
|
||||||
|
reverse: "/api/reverse"
|
||||||
|
address_book: "/api/address_book"
|
||||||
|
valhalla: "/valhalla"
|
||||||
|
|
||||||
|
# TODO(matt): confirm logout next= host for this profile
|
||||||
|
auth:
|
||||||
|
login_url: "/outpost.goauthentik.io/start?rd=%2F"
|
||||||
|
logout_url: "https://auth.echo6.co/if/flow/default-invalidation-flow/?next=https://navi.echo6.co/"
|
||||||
|
|
||||||
|
features:
|
||||||
|
has_nominatim_details: true
|
||||||
|
has_kiwix_wiki: false
|
||||||
|
has_hillshade: true
|
||||||
|
has_3d_terrain: false
|
||||||
|
has_traffic_overlay: true
|
||||||
|
has_landclass: true
|
||||||
|
has_public_lands_layer: true
|
||||||
|
has_contours: true
|
||||||
|
has_contours_test: true
|
||||||
|
has_contours_test_10ft: true
|
||||||
|
has_address_book_write: true
|
||||||
|
has_overture_enrichment: false
|
||||||
|
has_google_places_enrichment: false
|
||||||
|
has_contacts: false
|
||||||
|
has_wiki_rewriting: true
|
||||||
|
has_wiki_discovery: false
|
||||||
|
|
||||||
|
defaults:
|
||||||
|
center: [44.0, -114.0]
|
||||||
|
zoom: 7
|
||||||
563
lib/api.py
563
lib/api.py
|
|
@ -35,12 +35,33 @@ _cache = {
|
||||||
'qdrant_scroll': None,
|
'qdrant_scroll': None,
|
||||||
'qdrant_scroll_ts': 0,
|
'qdrant_scroll_ts': 0,
|
||||||
'quick_stats': None,
|
'quick_stats': None,
|
||||||
|
'kiwix_sources': None,
|
||||||
}
|
}
|
||||||
|
|
||||||
app = Flask(__name__,
|
app = Flask(__name__,
|
||||||
template_folder=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'templates'),
|
template_folder=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'templates'),
|
||||||
static_folder=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'static'))
|
static_folder=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'static'))
|
||||||
|
|
||||||
|
app.config['MAX_CONTENT_LENGTH'] = None # ZIM files can be multi-GB
|
||||||
|
|
||||||
|
|
||||||
|
# ── Large ZIM upload support ──
|
||||||
|
# Override stream factory so ZIM uploads write directly to /mnt/kiwix/
|
||||||
|
# instead of /tmp (which is on the 96GB root disk and can't hold 100GB+ ZIMs).
|
||||||
|
from flask import Request as _FlaskRequest
|
||||||
|
|
||||||
|
class _LargeZimRequest(_FlaskRequest):
|
||||||
|
def _get_file_stream(self, total_content_length, content_type, filename=None, content_length=None):
|
||||||
|
if filename and filename.lower().endswith('.zim'):
|
||||||
|
return tempfile.NamedTemporaryFile('wb+', dir='/mnt/kiwix', prefix='.upload_', suffix='.tmp', delete=False)
|
||||||
|
return super()._get_file_stream(total_content_length, content_type, filename, content_length)
|
||||||
|
|
||||||
|
app.request_class = _LargeZimRequest
|
||||||
|
# ── Netsyms Blueprint ──
|
||||||
|
from .netsyms_api import netsyms_bp
|
||||||
|
app.register_blueprint(netsyms_bp)
|
||||||
|
|
||||||
|
|
||||||
# ── Navigation Constants ──
|
# ── Navigation Constants ──
|
||||||
|
|
||||||
KNOWLEDGE_SUBNAV = [
|
KNOWLEDGE_SUBNAV = [
|
||||||
|
|
@ -56,6 +77,11 @@ PEERTUBE_SUBNAV = [
|
||||||
{'href': '/peertube/channels', 'label': 'Channels'},
|
{'href': '/peertube/channels', 'label': 'Channels'},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
KIWIX_SUBNAV = [
|
||||||
|
{'href': '/kiwix', 'label': 'Library'},
|
||||||
|
{'href': '/kiwix/scraper', 'label': 'Scraper'},
|
||||||
|
]
|
||||||
SETTINGS_SUBNAV = [
|
SETTINGS_SUBNAV = [
|
||||||
{'href': '/settings/keys', 'label': 'API Keys'},
|
{'href': '/settings/keys', 'label': 'API Keys'},
|
||||||
{'href': '/settings/cookies', 'label': 'YouTube Cookies'},
|
{'href': '/settings/cookies', 'label': 'YouTube Cookies'},
|
||||||
|
|
@ -908,6 +934,7 @@ def _build_knowledge_stats():
|
||||||
c.source,
|
c.source,
|
||||||
CASE
|
CASE
|
||||||
WHEN c.source = 'stream.echo6.co' THEN 'transcript'
|
WHEN c.source = 'stream.echo6.co' THEN 'transcript'
|
||||||
|
WHEN c.source = 'kiwix' THEN 'wiki'
|
||||||
WHEN c.path LIKE 'http%' THEN 'web'
|
WHEN c.path LIKE 'http%' THEN 'web'
|
||||||
ELSE 'pdf'
|
ELSE 'pdf'
|
||||||
END as type,
|
END as type,
|
||||||
|
|
@ -967,6 +994,7 @@ def _build_knowledge_stats():
|
||||||
d.status, d.concepts_extracted, d.vectors_inserted,
|
d.status, d.concepts_extracted, d.vectors_inserted,
|
||||||
CASE
|
CASE
|
||||||
WHEN c.source = 'stream.echo6.co' THEN 'transcript'
|
WHEN c.source = 'stream.echo6.co' THEN 'transcript'
|
||||||
|
WHEN c.source = 'kiwix' THEN 'wiki'
|
||||||
WHEN d.path LIKE 'http%' THEN 'web'
|
WHEN d.path LIKE 'http%' THEN 'web'
|
||||||
ELSE 'pdf'
|
ELSE 'pdf'
|
||||||
END as type
|
END as type
|
||||||
|
|
@ -1072,6 +1100,12 @@ def start_cache_warmer(stop_event=None):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f" Quick stats warm-up failed: {e}")
|
logger.warning(f" Quick stats warm-up failed: {e}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
_cache['kiwix_sources'] = _build_kiwix_sources()
|
||||||
|
logger.info(" Kiwix sources cached")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f" Kiwix sources warm-up failed: {e}")
|
||||||
|
|
||||||
logger.info("Cache warmer ready — all data pre-loaded")
|
logger.info("Cache warmer ready — all data pre-loaded")
|
||||||
|
|
||||||
# Continuous refresh loop
|
# Continuous refresh loop
|
||||||
|
|
@ -1098,6 +1132,10 @@ def start_cache_warmer(stop_event=None):
|
||||||
_cache['quick_stats'] = _build_quick_stats()
|
_cache['quick_stats'] = _build_quick_stats()
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
try:
|
||||||
|
_cache['kiwix_sources'] = _build_kiwix_sources()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# PeerTube dashboard: every 30s (cycle 2, offset)
|
# PeerTube dashboard: every 30s (cycle 2, offset)
|
||||||
if cycle % 2 == 1:
|
if cycle % 2 == 1:
|
||||||
|
|
@ -1281,6 +1319,9 @@ def api_keys_reload():
|
||||||
return jsonify({'count': count})
|
return jsonify({'count': count})
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ── YouTube Cookie Management ──
|
# ── YouTube Cookie Management ──
|
||||||
|
|
||||||
PEERTUBE_HOST = '192.168.1.170'
|
PEERTUBE_HOST = '192.168.1.170'
|
||||||
|
|
@ -1930,6 +1971,528 @@ def api_peertube_dashboard():
|
||||||
return jsonify(_cache['pt_dashboard'])
|
return jsonify(_cache['pt_dashboard'])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ── Kiwix Dashboard ──
|
||||||
|
|
||||||
|
@app.route('/kiwix')
|
||||||
|
def kiwix_dashboard():
|
||||||
|
return render_template('kiwix/dashboard.html',
|
||||||
|
domain='kiwix', subnav=KIWIX_SUBNAV, active_page='/kiwix')
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/kiwix/scraper')
|
||||||
|
def kiwix_scraper():
|
||||||
|
return render_template('kiwix/scraper.html',
|
||||||
|
domain='kiwix', subnav=KIWIX_SUBNAV, active_page='/kiwix/scraper')
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/api/kiwix/sources')
|
||||||
|
def api_kiwix_sources():
|
||||||
|
"""Serve pre-cached Kiwix sources data (never blocks)."""
|
||||||
|
if _cache['kiwix_sources'] is None:
|
||||||
|
return jsonify({'error': 'Warming up, try again in a few seconds'}), 503
|
||||||
|
return jsonify(_cache['kiwix_sources'])
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/api/kiwix/toggle-ingest/<int:source_id>', methods=['POST'])
|
||||||
|
def api_kiwix_toggle_ingest(source_id):
|
||||||
|
"""Toggle ingest_enabled on a ZIM source."""
|
||||||
|
db = StatusDB()
|
||||||
|
conn = db._get_conn()
|
||||||
|
row = conn.execute("SELECT id, status, ingest_enabled FROM zim_sources WHERE id = ?", (source_id,)).fetchone()
|
||||||
|
if not row:
|
||||||
|
return jsonify({'error': 'Source not found'}), 404
|
||||||
|
|
||||||
|
data = request.get_json(silent=True) or {}
|
||||||
|
new_val = 1 if data.get('enabled', not row['ingest_enabled']) else 0
|
||||||
|
conn.execute("UPDATE zim_sources SET ingest_enabled = ? WHERE id = ?", (new_val, source_id))
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
# If toggling ON and source is eligible, spawn ingest in background
|
||||||
|
if new_val == 1 and row['status'] == 'detected':
|
||||||
|
_spawn_zim_ingest(source_id)
|
||||||
|
|
||||||
|
return jsonify({'ok': True, 'ingest_enabled': new_val})
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/api/kiwix/trigger-ingest/<int:source_id>', methods=['POST'])
|
||||||
|
def api_kiwix_trigger_ingest(source_id):
|
||||||
|
"""Explicit one-shot ingest trigger."""
|
||||||
|
db = StatusDB()
|
||||||
|
conn = db._get_conn()
|
||||||
|
row = conn.execute("SELECT id FROM zim_sources WHERE id = ?", (source_id,)).fetchone()
|
||||||
|
if not row:
|
||||||
|
return jsonify({'error': 'Source not found'}), 404
|
||||||
|
|
||||||
|
_spawn_zim_ingest(source_id)
|
||||||
|
return jsonify({'ok': True})
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/api/kiwix/upload', methods=['POST'])
|
||||||
|
def api_kiwix_upload():
|
||||||
|
"""Accept ZIM file upload, register with kiwix-serve, scan."""
|
||||||
|
import subprocess
|
||||||
|
if 'file' not in request.files:
|
||||||
|
return jsonify({'error': 'No file provided'}), 400
|
||||||
|
|
||||||
|
f = request.files['file']
|
||||||
|
if not f.filename or not f.filename.endswith('.zim'):
|
||||||
|
return jsonify({'error': 'File must be a .zim file'}), 400
|
||||||
|
|
||||||
|
filename = secure_filename(f.filename)
|
||||||
|
dest = os.path.join('/mnt/kiwix', filename)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Stream was written directly to /mnt/kiwix/ by _LargeZimRequest —
|
||||||
|
# rename in-place instead of copying 100GB+ through f.save()
|
||||||
|
if hasattr(f.stream, 'name') and f.stream.name:
|
||||||
|
tmp_path = f.stream.name
|
||||||
|
f.stream.close()
|
||||||
|
os.rename(tmp_path, dest)
|
||||||
|
else:
|
||||||
|
tmp_dest = dest + '.tmp'
|
||||||
|
f.save(tmp_dest)
|
||||||
|
os.rename(tmp_dest, dest)
|
||||||
|
except Exception as e:
|
||||||
|
# Clean up any temp files on failure
|
||||||
|
for p in [locals().get('tmp_path', ''), locals().get('tmp_dest', '')]:
|
||||||
|
if p and os.path.exists(p):
|
||||||
|
os.remove(p)
|
||||||
|
return jsonify({'error': f'Save failed: {e}'}), 500
|
||||||
|
|
||||||
|
# Register with kiwix-serve library
|
||||||
|
try:
|
||||||
|
subprocess.run(
|
||||||
|
['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'add', dest],
|
||||||
|
capture_output=True, text=True, timeout=30
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"kiwix-manage add failed: {e}")
|
||||||
|
|
||||||
|
# Scan for new entry (retry — monitorLibrary may need a moment to reload)
|
||||||
|
import time as _time
|
||||||
|
from .zim_monitor import scan_zims
|
||||||
|
for attempt in range(3):
|
||||||
|
try:
|
||||||
|
scan_zims()
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"scan_zims attempt {attempt+1} failed: {e}")
|
||||||
|
_time.sleep(2)
|
||||||
|
|
||||||
|
# Refresh cache
|
||||||
|
try:
|
||||||
|
_cache['kiwix_sources'] = _build_kiwix_sources()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return jsonify({'ok': True, 'filename': filename})
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _full_zim_cleanup(source_id):
|
||||||
|
"""Full ZIM cleanup: Qdrant vectors, DB records, kiwix-manage, SIGHUP, file delete.
|
||||||
|
Returns dict with results. Caller handles cache refresh."""
|
||||||
|
import subprocess
|
||||||
|
import signal
|
||||||
|
import requests as req
|
||||||
|
|
||||||
|
db = StatusDB()
|
||||||
|
conn = db._get_conn()
|
||||||
|
row = conn.execute("SELECT * FROM zim_sources WHERE id = ?", (source_id,)).fetchone()
|
||||||
|
if not row:
|
||||||
|
return None
|
||||||
|
|
||||||
|
zim_source = dict(row)
|
||||||
|
zim_filename = zim_source['zim_filename']
|
||||||
|
zim_path = zim_source['zim_path']
|
||||||
|
zim_title = zim_source.get('title', zim_filename)
|
||||||
|
results = {'vectors_deleted': 0, 'docs_deleted': 0, 'file_deleted': False, 'scrape_jobs_deleted': 0}
|
||||||
|
|
||||||
|
# Step 1: Find all document hashes for this ZIM source
|
||||||
|
doc_hashes = [r['hash'] for r in conn.execute(
|
||||||
|
"SELECT c.hash FROM catalogue c WHERE c.source = 'kiwix' AND c.category = ?",
|
||||||
|
(zim_title,)
|
||||||
|
).fetchall()]
|
||||||
|
|
||||||
|
# Step 2: Delete vectors from Qdrant
|
||||||
|
if doc_hashes:
|
||||||
|
config = get_config()
|
||||||
|
qdrant_host = config.get('vector_db', {}).get('host', '100.64.0.14')
|
||||||
|
qdrant_port = config.get('vector_db', {}).get('port', 6333)
|
||||||
|
collection = config.get('vector_db', {}).get('collection', 'recon_knowledge')
|
||||||
|
|
||||||
|
# Delete in batches of 100 hashes
|
||||||
|
for i in range(0, len(doc_hashes), 100):
|
||||||
|
batch = doc_hashes[i:i+100]
|
||||||
|
try:
|
||||||
|
resp = req.post(
|
||||||
|
f"http://{qdrant_host}:{qdrant_port}/collections/{collection}/points/delete",
|
||||||
|
json={
|
||||||
|
"filter": {
|
||||||
|
"must": [{
|
||||||
|
"key": "doc_hash",
|
||||||
|
"match": {"any": batch}
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
timeout=30
|
||||||
|
)
|
||||||
|
if resp.status_code == 200:
|
||||||
|
results['vectors_deleted'] += len(batch)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Qdrant delete batch failed: {e}")
|
||||||
|
|
||||||
|
# Step 3: Delete DB records
|
||||||
|
for h in doc_hashes:
|
||||||
|
# Delete processing directory if it exists
|
||||||
|
text_dir_row = conn.execute("SELECT text_dir FROM documents WHERE hash = ?", (h,)).fetchone()
|
||||||
|
if text_dir_row and text_dir_row['text_dir']:
|
||||||
|
try:
|
||||||
|
import shutil
|
||||||
|
shutil.rmtree(text_dir_row['text_dir'], ignore_errors=True)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
conn.execute("DELETE FROM documents WHERE hash = ?", (h,))
|
||||||
|
conn.execute("DELETE FROM catalogue WHERE hash = ?", (h,))
|
||||||
|
results['docs_deleted'] = len(doc_hashes)
|
||||||
|
|
||||||
|
# Delete zim_articles records
|
||||||
|
conn.execute("DELETE FROM zim_articles WHERE zim_source_id = ?", (source_id,))
|
||||||
|
|
||||||
|
# Delete zim_sources record
|
||||||
|
conn.execute("DELETE FROM zim_sources WHERE id = ?", (source_id,))
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
# Step 4: Remove from kiwix-serve library
|
||||||
|
try:
|
||||||
|
subprocess.run(
|
||||||
|
['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'remove', zim_filename.replace('.zim', '')],
|
||||||
|
capture_output=True, text=True, timeout=10
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"kiwix-manage remove failed: {e}")
|
||||||
|
|
||||||
|
# Step 4b: SIGHUP kiwix-serve to reload library
|
||||||
|
try:
|
||||||
|
result = subprocess.run(['pidof', 'kiwix-serve'], capture_output=True, text=True, timeout=5)
|
||||||
|
if result.returncode == 0 and result.stdout.strip():
|
||||||
|
pid = int(result.stdout.strip().split()[0])
|
||||||
|
os.kill(pid, signal.SIGHUP)
|
||||||
|
logger.info(f"Sent SIGHUP to kiwix-serve (pid {pid})")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to signal kiwix-serve: {e}")
|
||||||
|
|
||||||
|
# Step 5: Delete the ZIM file
|
||||||
|
if os.path.isfile(zim_path):
|
||||||
|
try:
|
||||||
|
os.remove(zim_path)
|
||||||
|
results['file_deleted'] = True
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"ZIM file delete failed: {e}")
|
||||||
|
results['file_deleted'] = False
|
||||||
|
|
||||||
|
# Step 6: Delete any linked scrape_jobs rows
|
||||||
|
try:
|
||||||
|
res = conn.execute("DELETE FROM scrape_jobs WHERE zim_source_id = ?", (source_id,))
|
||||||
|
conn.commit()
|
||||||
|
results['scrape_jobs_deleted'] = res.rowcount
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"scrape_jobs cleanup failed: {e}")
|
||||||
|
|
||||||
|
logger.info(f"Full ZIM cleanup for source {source_id} ('{zim_title}'): {results}")
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/api/kiwix/remove/<int:source_id>', methods=['POST'])
|
||||||
|
def api_kiwix_remove(source_id):
|
||||||
|
"""Remove a ZIM source: delete vectors, DB records, library entry, and file."""
|
||||||
|
db = StatusDB()
|
||||||
|
conn = db._get_conn()
|
||||||
|
row = conn.execute("SELECT * FROM zim_sources WHERE id = ?", (source_id,)).fetchone()
|
||||||
|
if not row:
|
||||||
|
return jsonify({'error': 'Source not found'}), 404
|
||||||
|
|
||||||
|
results = _full_zim_cleanup(source_id)
|
||||||
|
if results is None:
|
||||||
|
return jsonify({'error': 'Source not found during cleanup'}), 404
|
||||||
|
|
||||||
|
# Refresh cache
|
||||||
|
try:
|
||||||
|
_cache['kiwix_sources'] = _build_kiwix_sources()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return jsonify({'ok': True, 'results': results})
|
||||||
|
|
||||||
|
|
||||||
|
def _spawn_zim_ingest(source_id):
|
||||||
|
"""Start ZIM ingestion in a background thread."""
|
||||||
|
def _run():
|
||||||
|
try:
|
||||||
|
from .processors.zim_processor import ingest_zim
|
||||||
|
config = get_config()
|
||||||
|
db = StatusDB()
|
||||||
|
logger.info(f"Starting ZIM ingest for source {source_id}")
|
||||||
|
result = ingest_zim(source_id, db, config)
|
||||||
|
logger.info(f"ZIM ingest complete for source {source_id}: {result}")
|
||||||
|
# Refresh cache after completion
|
||||||
|
try:
|
||||||
|
_cache['kiwix_sources'] = _build_kiwix_sources()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"ZIM ingest failed for source {source_id}: {e}")
|
||||||
|
|
||||||
|
t = threading.Thread(target=_run, daemon=True, name=f'zim-ingest-{source_id}')
|
||||||
|
t.start()
|
||||||
|
|
||||||
|
|
||||||
|
def _build_kiwix_sources():
|
||||||
|
"""Build Kiwix sources data for the dashboard cache."""
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
db = StatusDB()
|
||||||
|
conn = db._get_conn()
|
||||||
|
|
||||||
|
# Get all ZIM sources
|
||||||
|
rows = conn.execute("""
|
||||||
|
SELECT id, zim_filename, title, description, language, category,
|
||||||
|
article_count, status, processed_count, skipped_count, error_count,
|
||||||
|
ingest_enabled, detected_at, started_at, completed_at
|
||||||
|
FROM zim_sources
|
||||||
|
ORDER BY detected_at DESC
|
||||||
|
""").fetchall()
|
||||||
|
|
||||||
|
sources = []
|
||||||
|
total_articles = 0
|
||||||
|
total_processed = 0
|
||||||
|
total_in_pipeline = 0
|
||||||
|
|
||||||
|
for r in rows:
|
||||||
|
source = dict(r)
|
||||||
|
zim_title = r['title'] or r['zim_filename']
|
||||||
|
total_articles += r['article_count'] or 0
|
||||||
|
total_processed += r['processed_count'] or 0
|
||||||
|
|
||||||
|
# Get pipeline stats for THIS source's documents (filtered by category)
|
||||||
|
pipeline = {}
|
||||||
|
try:
|
||||||
|
pipe_rows = conn.execute("""
|
||||||
|
SELECT d.status, COUNT(*) as cnt
|
||||||
|
FROM documents d
|
||||||
|
JOIN catalogue c ON d.hash = c.hash
|
||||||
|
WHERE c.source = 'kiwix' AND c.category = ?
|
||||||
|
GROUP BY d.status
|
||||||
|
""", (zim_title,)).fetchall()
|
||||||
|
for pr in pipe_rows:
|
||||||
|
pipeline[pr['status']] = pr['cnt']
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
in_pipe = sum(v for k, v in pipeline.items() if k not in ('complete', 'failed'))
|
||||||
|
total_in_pipeline += in_pipe
|
||||||
|
source['pipeline'] = pipeline
|
||||||
|
|
||||||
|
# Compute effective status reflecting full pipeline state
|
||||||
|
db_status = r['status']
|
||||||
|
if db_status == 'complete' and pipeline:
|
||||||
|
if in_pipe > 0:
|
||||||
|
source['effective_status'] = 'processing'
|
||||||
|
else:
|
||||||
|
source['effective_status'] = 'complete'
|
||||||
|
elif db_status == 'ingesting':
|
||||||
|
source['effective_status'] = 'extracting'
|
||||||
|
else:
|
||||||
|
source['effective_status'] = db_status # 'detected'
|
||||||
|
|
||||||
|
sources.append(source)
|
||||||
|
|
||||||
|
# Check kiwix-serve health
|
||||||
|
kiwix_status = 'inactive'
|
||||||
|
try:
|
||||||
|
resp = urllib.request.urlopen("http://localhost:8430", timeout=3)
|
||||||
|
if resp.status == 200:
|
||||||
|
kiwix_status = 'active'
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return {
|
||||||
|
'sources': sources,
|
||||||
|
'kiwix_serve': {'status': kiwix_status, 'url': 'https://wiki.echo6.co'},
|
||||||
|
'totals': {
|
||||||
|
'sources': len(sources),
|
||||||
|
'articles': total_articles,
|
||||||
|
'processed': total_processed,
|
||||||
|
'in_pipeline': total_in_pipeline,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ── Scraper API ──
|
||||||
|
|
||||||
|
@app.route('/api/scraper/submit', methods=['POST'])
|
||||||
|
def api_scraper_submit():
|
||||||
|
"""Submit a new web scrape job."""
|
||||||
|
data = request.get_json(silent=True) or {}
|
||||||
|
url = (data.get('url') or '').strip()
|
||||||
|
|
||||||
|
if not url:
|
||||||
|
return jsonify({'error': 'url is required'}), 400
|
||||||
|
if not url.startswith(('http://', 'https://')):
|
||||||
|
return jsonify({'error': 'URL must start with http:// or https://'}), 400
|
||||||
|
|
||||||
|
config = get_config()
|
||||||
|
scraper_cfg = config.get('scraper', {})
|
||||||
|
language = data.get('language') or scraper_cfg.get('default_language', 'eng')
|
||||||
|
title = data.get('title', '').strip() or None
|
||||||
|
category = data.get('category', '').strip() or None
|
||||||
|
|
||||||
|
db = StatusDB()
|
||||||
|
conn = db._get_conn()
|
||||||
|
conn.execute(
|
||||||
|
"INSERT INTO scrape_jobs (url, title, language, category, crawl_mode) VALUES (?, ?, ?, ?, ?)",
|
||||||
|
(url, title, language, category, 'zimit')
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
job_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
|
||||||
|
|
||||||
|
logger.info(f"Scraper job {job_id} submitted: {url}")
|
||||||
|
return jsonify({'ok': True, 'job_id': job_id}), 201
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/api/scraper/jobs')
|
||||||
|
def api_scraper_jobs():
|
||||||
|
"""List scrape jobs, optionally filtered by status."""
|
||||||
|
status_filter = request.args.get('status')
|
||||||
|
db = StatusDB()
|
||||||
|
jobs = db.get_scrape_jobs(status=status_filter)
|
||||||
|
return jsonify({'jobs': jobs})
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/api/scraper/cancel/<int:job_id>', methods=['POST'])
|
||||||
|
def api_scraper_cancel(job_id):
|
||||||
|
"""Cancel a scrape job."""
|
||||||
|
|
||||||
|
db = StatusDB()
|
||||||
|
job = db.get_scrape_job(job_id)
|
||||||
|
if not job:
|
||||||
|
return jsonify({'error': 'Job not found'}), 404
|
||||||
|
|
||||||
|
if job['status'] in ('complete', 'cancelled'):
|
||||||
|
return jsonify({'error': f"Job already {job['status']}"}), 400
|
||||||
|
|
||||||
|
# Set cancelled in DB — the runner loop checks this between phases
|
||||||
|
db.update_scrape_job(job_id, status='cancelled')
|
||||||
|
|
||||||
|
# Stop the Docker container if running
|
||||||
|
container_name = f'recon-scraper-{job_id}'
|
||||||
|
try:
|
||||||
|
import subprocess as _subprocess
|
||||||
|
_subprocess.run(['docker', 'rm', '-f', container_name],
|
||||||
|
capture_output=True, timeout=10)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
logger.info(f"Scraper job {job_id} cancelled")
|
||||||
|
return jsonify({'ok': True})
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/api/scraper/retry/<int:job_id>', methods=['POST'])
|
||||||
|
def api_scraper_retry(job_id):
|
||||||
|
"""Retry a failed or cancelled scrape job."""
|
||||||
|
db = StatusDB()
|
||||||
|
job = db.get_scrape_job(job_id)
|
||||||
|
if not job:
|
||||||
|
return jsonify({'error': 'Job not found'}), 404
|
||||||
|
|
||||||
|
if job['status'] not in ('failed', 'cancelled'):
|
||||||
|
return jsonify({'error': f"Job status is '{job['status']}', can only retry failed or cancelled jobs"}), 400
|
||||||
|
|
||||||
|
db.update_scrape_job(job_id,
|
||||||
|
status='pending',
|
||||||
|
error_message=None,
|
||||||
|
subprocess_pid=None,
|
||||||
|
crawl_mode=None,
|
||||||
|
started_at=None,
|
||||||
|
completed_at=None)
|
||||||
|
|
||||||
|
logger.info(f"Scraper job {job_id} reset to pending for retry")
|
||||||
|
return jsonify({'ok': True})
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/api/scraper/delete/<int:job_id>', methods=['POST'])
|
||||||
|
def api_scraper_delete(job_id):
|
||||||
|
"""Delete a scrape job and clean up any associated ZIM artifacts."""
|
||||||
|
import subprocess
|
||||||
|
import signal
|
||||||
|
|
||||||
|
db = StatusDB()
|
||||||
|
job = db.get_scrape_job(job_id)
|
||||||
|
if not job:
|
||||||
|
return jsonify({'error': 'Job not found'}), 404
|
||||||
|
|
||||||
|
if job['status'] == 'running':
|
||||||
|
return jsonify({'error': 'Cannot delete a running job \u2014 cancel it first'}), 400
|
||||||
|
|
||||||
|
zim_cleanup_results = None
|
||||||
|
|
||||||
|
# If the job has a linked zim_source, do full cleanup
|
||||||
|
if job.get('zim_source_id'):
|
||||||
|
zim_cleanup_results = _full_zim_cleanup(job['zim_source_id'])
|
||||||
|
try:
|
||||||
|
_cache['kiwix_sources'] = _build_kiwix_sources()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
elif job.get('zim_filename'):
|
||||||
|
# No zim_source row, but there may be an orphan file + library entry
|
||||||
|
zim_path = os.path.join('/mnt/kiwix', job['zim_filename'])
|
||||||
|
if os.path.isfile(zim_path):
|
||||||
|
try:
|
||||||
|
os.remove(zim_path)
|
||||||
|
logger.info(f"Deleted orphan ZIM file: {zim_path}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to delete orphan ZIM file {zim_path}: {e}")
|
||||||
|
try:
|
||||||
|
subprocess.run(
|
||||||
|
['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'remove',
|
||||||
|
job['zim_filename'].replace('.zim', '')],
|
||||||
|
capture_output=True, text=True, timeout=10
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"kiwix-manage remove failed for orphan: {e}")
|
||||||
|
try:
|
||||||
|
result = subprocess.run(['pidof', 'kiwix-serve'], capture_output=True, text=True, timeout=5)
|
||||||
|
if result.returncode == 0 and result.stdout.strip():
|
||||||
|
pid = int(result.stdout.strip().split()[0])
|
||||||
|
os.kill(pid, signal.SIGHUP)
|
||||||
|
logger.info(f"Sent SIGHUP to kiwix-serve (pid {pid})")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to signal kiwix-serve: {e}")
|
||||||
|
|
||||||
|
# Delete the scrape_jobs row (may already be gone if _full_zim_cleanup deleted it)
|
||||||
|
conn = db._get_conn()
|
||||||
|
conn.execute("DELETE FROM scrape_jobs WHERE id = ?", (job_id,))
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
logger.info(f"Scraper job {job_id} deleted (zim_cleanup={zim_cleanup_results})")
|
||||||
|
return jsonify({'ok': True, 'zim_cleanup': zim_cleanup_results})
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/api/scraper/clear-failed', methods=['POST'])
|
||||||
|
def api_scraper_clear_failed():
|
||||||
|
"""Delete all failed and cancelled scrape jobs."""
|
||||||
|
db = StatusDB()
|
||||||
|
conn = db._get_conn()
|
||||||
|
result = conn.execute("DELETE FROM scrape_jobs WHERE status IN ('failed', 'cancelled')")
|
||||||
|
conn.commit()
|
||||||
|
count = result.rowcount
|
||||||
|
logger.info(f"Cleared {count} failed/cancelled scraper jobs")
|
||||||
|
return jsonify({'ok': True, 'deleted': count})
|
||||||
|
|
||||||
|
|
||||||
# ── Metrics API ──
|
# ── Metrics API ──
|
||||||
|
|
||||||
@app.route('/api/metrics/history')
|
@app.route('/api/metrics/history')
|
||||||
|
|
|
||||||
117
lib/aurora_nav_tool.py
Normal file
117
lib/aurora_nav_tool.py
Normal file
|
|
@ -0,0 +1,117 @@
|
||||||
|
"""
|
||||||
|
title: Navigation
|
||||||
|
author: Echo6
|
||||||
|
version: 1.1.0
|
||||||
|
description: Turn-by-turn directions and geocoding via Photon + Valhalla on recon-vm. Supports driving, walking, cycling, and truck routing with worldwide coverage (281M places).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
_COORD_RE = re.compile(r'^(-?\d+\.?\d*)\s*,\s*(-?\d+\.?\d*)$')
|
||||||
|
|
||||||
|
|
||||||
|
class Tools:
|
||||||
|
class Valves(BaseModel):
|
||||||
|
photon_url: str = Field(
|
||||||
|
default="http://100.64.0.24:2322",
|
||||||
|
description="Photon geocoding service URL (recon-vm)",
|
||||||
|
)
|
||||||
|
valhalla_url: str = Field(
|
||||||
|
default="http://100.64.0.24:8002",
|
||||||
|
description="Valhalla routing service URL (recon-vm)",
|
||||||
|
)
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.valves = self.Valves()
|
||||||
|
|
||||||
|
def _geocode(self, query: str):
|
||||||
|
m = _COORD_RE.match(query.strip())
|
||||||
|
if m:
|
||||||
|
lat, lon = float(m.group(1)), float(m.group(2))
|
||||||
|
return lat, lon, query
|
||||||
|
resp = requests.get(
|
||||||
|
f"{self.valves.photon_url}/api",
|
||||||
|
params={"q": query, "limit": 1},
|
||||||
|
timeout=10,
|
||||||
|
)
|
||||||
|
resp.raise_for_status()
|
||||||
|
features = resp.json().get("features", [])
|
||||||
|
if not features:
|
||||||
|
return None, None, None
|
||||||
|
props = features[0]["properties"]
|
||||||
|
coords = features[0]["geometry"]["coordinates"]
|
||||||
|
parts = [props.get("name", "")]
|
||||||
|
for key in ("city", "state", "country"):
|
||||||
|
v = props.get(key)
|
||||||
|
if v and v != parts[-1]:
|
||||||
|
parts.append(v)
|
||||||
|
return coords[1], coords[0], ", ".join(p for p in parts if p)
|
||||||
|
|
||||||
|
def get_directions(
|
||||||
|
self,
|
||||||
|
origin: str,
|
||||||
|
destination: str,
|
||||||
|
mode: str = "auto",
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Get turn-by-turn directions between two locations. When this tool returns results, present the directions exactly as returned — do not summarize or rephrase. Include all steps.
|
||||||
|
|
||||||
|
:param origin: Starting location — address, place name, or lat,lon coordinates
|
||||||
|
:param destination: Destination — address, place name, or lat,lon coordinates
|
||||||
|
:param mode: Travel mode: auto, pedestrian, bicycle, or truck (default: auto)
|
||||||
|
:return: Formatted turn-by-turn directions
|
||||||
|
"""
|
||||||
|
if mode not in ("auto", "pedestrian", "bicycle", "truck"):
|
||||||
|
mode = "auto"
|
||||||
|
|
||||||
|
orig_lat, orig_lon, orig_name = self._geocode(origin)
|
||||||
|
if orig_lat is None:
|
||||||
|
return f"Could not find location: {origin}"
|
||||||
|
|
||||||
|
dest_lat, dest_lon, dest_name = self._geocode(destination)
|
||||||
|
if dest_lat is None:
|
||||||
|
return f"Could not find location: {destination}"
|
||||||
|
|
||||||
|
try:
|
||||||
|
resp = requests.post(
|
||||||
|
f"{self.valves.valhalla_url}/route",
|
||||||
|
json={
|
||||||
|
"locations": [
|
||||||
|
{"lat": orig_lat, "lon": orig_lon},
|
||||||
|
{"lat": dest_lat, "lon": dest_lon},
|
||||||
|
],
|
||||||
|
"costing": mode,
|
||||||
|
"directions_options": {"units": "miles"},
|
||||||
|
},
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
except requests.RequestException:
|
||||||
|
return "Navigation service unavailable"
|
||||||
|
|
||||||
|
if resp.status_code != 200:
|
||||||
|
return "No route found between locations"
|
||||||
|
|
||||||
|
trip = resp.json()["trip"]
|
||||||
|
summary = trip["summary"]
|
||||||
|
legs = trip["legs"][0]["maneuvers"]
|
||||||
|
|
||||||
|
miles = round(summary["length"], 1)
|
||||||
|
minutes = round(summary["time"] / 60, 1)
|
||||||
|
|
||||||
|
lines = [
|
||||||
|
f"Directions from {orig_name} to {dest_name} ({mode}):",
|
||||||
|
f"Distance: {miles} miles | Time: {minutes} minutes",
|
||||||
|
"",
|
||||||
|
]
|
||||||
|
for i, m in enumerate(legs, 1):
|
||||||
|
inst = m["instruction"]
|
||||||
|
dist = m.get("length", 0)
|
||||||
|
if dist > 0:
|
||||||
|
lines.append(f"{i}. {inst} — {round(dist, 1)} mi")
|
||||||
|
else:
|
||||||
|
lines.append(f"{i}. {inst}")
|
||||||
|
|
||||||
|
return "\n".join(lines)
|
||||||
22
lib/auth.py
Normal file
22
lib/auth.py
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
"""
|
||||||
|
RECON Auth Helper — extract user identity from Authentik forward-auth headers.
|
||||||
|
"""
|
||||||
|
from functools import wraps
|
||||||
|
from flask import request, jsonify
|
||||||
|
|
||||||
|
|
||||||
|
def get_user_id():
|
||||||
|
"""Return X-Authentik-Username or None."""
|
||||||
|
return request.headers.get('X-Authentik-Username')
|
||||||
|
|
||||||
|
|
||||||
|
def require_auth(f):
|
||||||
|
"""Decorator: 401 if no Authentik auth header."""
|
||||||
|
@wraps(f)
|
||||||
|
def wrapper(*args, **kwargs):
|
||||||
|
user_id = get_user_id()
|
||||||
|
if not user_id:
|
||||||
|
return jsonify({'error': 'Authentication required'}), 401
|
||||||
|
request.user_id = user_id
|
||||||
|
return f(*args, **kwargs)
|
||||||
|
return wrapper
|
||||||
54
lib/deployment_config.py
Normal file
54
lib/deployment_config.py
Normal file
|
|
@ -0,0 +1,54 @@
|
||||||
|
"""
|
||||||
|
Deployment profile loader.
|
||||||
|
|
||||||
|
Reads RECON_PROFILE env var (default: "home"), loads the matching YAML
|
||||||
|
from config/profiles/<profile>.yaml, and caches the parsed dict in memory.
|
||||||
|
|
||||||
|
Exposes get_deployment_config() as the in-process accessor for the profile.
|
||||||
|
|
||||||
|
Note: its former consumers (the /api/landclass gate, google_places,
|
||||||
|
place_detail, offroute/router) were all extracted to navi-* services or removed
|
||||||
|
across cleanups #4–#6/#27 — recon has no remaining caller of
|
||||||
|
get_deployment_config() today; the module is retained per cleanup #1.
|
||||||
|
(The former /api/config HTTP endpoint that served this dict to the frontend was
|
||||||
|
removed once navi-config (:8422) took over that route.)
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
from .utils import setup_logging
|
||||||
|
|
||||||
|
logger = setup_logging('recon.deployment_config')
|
||||||
|
|
||||||
|
_config_cache = None
|
||||||
|
|
||||||
|
|
||||||
|
def load_deployment_config():
|
||||||
|
"""Load and cache the deployment profile. Called once at import time."""
|
||||||
|
global _config_cache
|
||||||
|
|
||||||
|
profile = os.environ.get('RECON_PROFILE', 'home')
|
||||||
|
config_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'config', 'profiles')
|
||||||
|
config_path = os.path.join(config_dir, f'{profile}.yaml')
|
||||||
|
|
||||||
|
if not os.path.exists(config_path):
|
||||||
|
raise FileNotFoundError(
|
||||||
|
f"Deployment profile '{profile}' not found at {config_path}. "
|
||||||
|
f"Available profiles: {', '.join(f.replace('.yaml','') for f in os.listdir(config_dir) if f.endswith('.yaml'))}"
|
||||||
|
)
|
||||||
|
|
||||||
|
with open(config_path, 'r') as f:
|
||||||
|
_config_cache = yaml.safe_load(f)
|
||||||
|
|
||||||
|
logger.info(f"Loaded deployment profile: {profile} ({_config_cache.get('region_name', 'unknown')})")
|
||||||
|
return _config_cache
|
||||||
|
|
||||||
|
|
||||||
|
def get_deployment_config():
|
||||||
|
"""Return the cached deployment config dict."""
|
||||||
|
if _config_cache is None:
|
||||||
|
load_deployment_config()
|
||||||
|
return _config_cache
|
||||||
|
|
||||||
|
|
||||||
|
# Load on import so startup fails fast if profile is missing
|
||||||
|
load_deployment_config()
|
||||||
|
|
@ -10,6 +10,7 @@ Dependencies: requests, qdrant-client
|
||||||
Config: embedding, vector_db, processing.embed_workers
|
Config: embedding, vector_db, processing.embed_workers
|
||||||
"""
|
"""
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
|
|
@ -290,7 +291,17 @@ def embed_single(file_hash, db, config):
|
||||||
page_timestamps = meta['page_timestamps']
|
page_timestamps = meta['page_timestamps']
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
if doc.get('path'):
|
# For ZIM articles, build wiki.echo6.co URL from meta.json
|
||||||
|
if source_type == 'zim' and meta.get('article_path'):
|
||||||
|
from urllib.parse import quote as url_quote
|
||||||
|
zim_name = meta.get('zim_name', '')
|
||||||
|
if not zim_name:
|
||||||
|
# Derive from zim_file: strip only .zim extension, keep full name
|
||||||
|
zf = meta.get('zim_file', '')
|
||||||
|
zim_name = zf.removesuffix('.zim')
|
||||||
|
article_path = url_quote(meta['article_path'], safe='/:@!$&()*+,;=-._~')
|
||||||
|
download_url = f'https://wiki.echo6.co/content/{zim_name}/{article_path}'
|
||||||
|
elif doc.get('path'):
|
||||||
download_url = generate_download_url(
|
download_url = generate_download_url(
|
||||||
doc['path'], config.get('library_root', '/mnt/library')
|
doc['path'], config.get('library_root', '/mnt/library')
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,15 @@ from .utils import get_config, setup_logging
|
||||||
from .status import StatusDB
|
from .status import StatusDB
|
||||||
from .utils import resolve_text_dir
|
from .utils import resolve_text_dir
|
||||||
|
|
||||||
|
try:
|
||||||
|
from langdetect import detect as _detect_lang
|
||||||
|
from langdetect.lang_detect_exception import LangDetectException
|
||||||
|
_HAS_LANGDETECT = True
|
||||||
|
except ImportError:
|
||||||
|
_HAS_LANGDETECT = False
|
||||||
|
|
||||||
|
ALLOWED_LANGUAGES = {'en'} # Default: English only
|
||||||
|
|
||||||
logger = setup_logging('recon.enricher')
|
logger = setup_logging('recon.enricher')
|
||||||
|
|
||||||
# Docs stuck in "enriching" longer than this get reset to "extracted" for retry
|
# Docs stuck in "enriching" longer than this get reset to "extracted" for retry
|
||||||
|
|
@ -341,6 +350,42 @@ def validate_and_fix_concepts(concepts, key, config):
|
||||||
return concepts
|
return concepts
|
||||||
|
|
||||||
|
|
||||||
|
def _check_language(text_dir, config):
|
||||||
|
"""Check language of document text. Returns (is_allowed, detected_lang).
|
||||||
|
|
||||||
|
Reads first 1000 chars from first page file and uses langdetect.
|
||||||
|
Returns (True, lang) if language is allowed, (False, lang) if not.
|
||||||
|
Falls back to (True, 'unknown') if detection fails (benefit of the doubt).
|
||||||
|
"""
|
||||||
|
if not _HAS_LANGDETECT:
|
||||||
|
return True, 'unknown'
|
||||||
|
|
||||||
|
# Check if language filter is enabled in config
|
||||||
|
pipeline_cfg = config.get('pipeline', {})
|
||||||
|
if not pipeline_cfg.get('language_filter', True):
|
||||||
|
return True, 'disabled'
|
||||||
|
|
||||||
|
allowed = set(pipeline_cfg.get('allowed_languages', ['en']))
|
||||||
|
|
||||||
|
# Read first page for detection
|
||||||
|
page_files = sorted([f for f in os.listdir(text_dir)
|
||||||
|
if f.startswith('page_') and f.endswith('.txt')])
|
||||||
|
if not page_files:
|
||||||
|
return True, 'no_pages'
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(os.path.join(text_dir, page_files[0]), encoding='utf-8') as f:
|
||||||
|
sample = f.read(1500)
|
||||||
|
if len(sample.strip()) < 50:
|
||||||
|
return True, 'too_short'
|
||||||
|
lang = _detect_lang(sample)
|
||||||
|
return (lang in allowed), lang
|
||||||
|
except LangDetectException:
|
||||||
|
return True, 'detection_failed'
|
||||||
|
except Exception:
|
||||||
|
return True, 'error'
|
||||||
|
|
||||||
|
|
||||||
def enrich_single(file_hash, db, config, key_rotator):
|
def enrich_single(file_hash, db, config, key_rotator):
|
||||||
doc = db.get_document(file_hash)
|
doc = db.get_document(file_hash)
|
||||||
if not doc:
|
if not doc:
|
||||||
|
|
@ -359,6 +404,14 @@ def enrich_single(file_hash, db, config, key_rotator):
|
||||||
db.mark_failed(file_hash, f"Text directory not found: {text_dir}")
|
db.mark_failed(file_hash, f"Text directory not found: {text_dir}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# Language gate: skip non-English documents before burning Gemini quota
|
||||||
|
lang_ok, detected_lang = _check_language(text_dir, config)
|
||||||
|
if not lang_ok:
|
||||||
|
logger.info(f"Skipping {file_hash[:12]}... detected language '{detected_lang}' "
|
||||||
|
f"(allowed: {config.get('pipeline', {}).get('allowed_languages', ['en'])})")
|
||||||
|
db.mark_failed(file_hash, f"Language filter: detected '{detected_lang}', not in allowed list")
|
||||||
|
return False
|
||||||
|
|
||||||
db.update_status(file_hash, 'enriching')
|
db.update_status(file_hash, 'enriching')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,7 @@ Config: processing.extract_workers, processing.max_pdf_size_mb,
|
||||||
processing.extract_timeout, processing.page_timeout
|
processing.extract_timeout, processing.page_timeout
|
||||||
"""
|
"""
|
||||||
import base64
|
import base64
|
||||||
|
import re
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
|
@ -99,6 +100,40 @@ def _is_transient(error_str):
|
||||||
return any(sig in s for sig in transient_signals)
|
return any(sig in s for sig in transient_signals)
|
||||||
|
|
||||||
|
|
||||||
|
def _text_quality_ok(text, min_length=50):
|
||||||
|
"""Check if extracted text meets quality thresholds.
|
||||||
|
|
||||||
|
Beyond the basic length check, validates:
|
||||||
|
- Word-boundary ratio: at least 60% of tokens should be real words (2+ alpha chars)
|
||||||
|
- Concatenation ratio: lowercase-immediately-followed-by-uppercase shouldn't exceed 10% of word count
|
||||||
|
|
||||||
|
Returns True if text passes all checks.
|
||||||
|
"""
|
||||||
|
text = text.strip()
|
||||||
|
if len(text) < min_length:
|
||||||
|
return False
|
||||||
|
|
||||||
|
words = text.split()
|
||||||
|
if not words:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Word-like ratio: tokens with 2+ alphabetic characters
|
||||||
|
word_like = sum(1 for w in words if len(re.findall(r'[a-zA-Z]', w)) >= 2)
|
||||||
|
word_ratio = word_like / len(words)
|
||||||
|
if word_ratio < 0.60:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Concatenation detector: lowercase immediately followed by uppercase
|
||||||
|
# Filter out common camelCase patterns in code (short tokens)
|
||||||
|
concat_hits = len(re.findall(r'[a-z][A-Z]', text))
|
||||||
|
concat_ratio = concat_hits / len(words) if words else 0
|
||||||
|
if concat_ratio > 0.10:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _render_page_to_png(pdf_path, page_num_1indexed, dpi=200, timeout=30):
|
def _render_page_to_png(pdf_path, page_num_1indexed, dpi=200, timeout=30):
|
||||||
"""Render a single PDF page to PNG bytes using pdftoppm.
|
"""Render a single PDF page to PNG bytes using pdftoppm.
|
||||||
|
|
||||||
|
|
@ -224,7 +259,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30):
|
||||||
# Method 1: pdftotext (poppler)
|
# Method 1: pdftotext (poppler)
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['pdftotext', '-f', str(page_num_0indexed + 1),
|
['pdftotext', '-layout', '-f', str(page_num_0indexed + 1),
|
||||||
'-l', str(page_num_0indexed + 1), pdf_path, '-'],
|
'-l', str(page_num_0indexed + 1), pdf_path, '-'],
|
||||||
capture_output=True, text=True, timeout=page_timeout
|
capture_output=True, text=True, timeout=page_timeout
|
||||||
)
|
)
|
||||||
|
|
@ -233,7 +268,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30):
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if len(text.strip()) >= 50:
|
if _text_quality_ok(text):
|
||||||
return text, 'pdftotext'
|
return text, 'pdftotext'
|
||||||
|
|
||||||
# Method 2: pdftoppm + Tesseract OCR
|
# Method 2: pdftoppm + Tesseract OCR
|
||||||
|
|
@ -258,7 +293,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30):
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if len(text.strip()) >= 50:
|
if _text_quality_ok(text):
|
||||||
return text, 'tesseract'
|
return text, 'tesseract'
|
||||||
|
|
||||||
# Method 3: Gemini Vision (last resort)
|
# Method 3: Gemini Vision (last resort)
|
||||||
|
|
@ -276,8 +311,26 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30):
|
||||||
# ── Core extraction functions ──
|
# ── Core extraction functions ──
|
||||||
|
|
||||||
def _pypdf2_extract(reader, page_num):
|
def _pypdf2_extract(reader, page_num):
|
||||||
"""Extract text from a PyPDF2 page object. Runs inside a thread for timeout."""
|
"""Extract text from a PyPDF2 page object. Runs inside a thread for timeout.
|
||||||
return reader.pages[page_num].extract_text() or ''
|
|
||||||
|
Tries default extraction first (space_width=200). If quality check fails,
|
||||||
|
retries with space_width=100 which better detects word boundaries in
|
||||||
|
tightly-kerned PDFs (common in Haynes/workshop manuals).
|
||||||
|
|
||||||
|
Note: PyPDF2 3.0.1 does not support layout=True. The space_width parameter
|
||||||
|
controls word-boundary detection tolerance. Lower values = more aggressive
|
||||||
|
space insertion between characters.
|
||||||
|
"""
|
||||||
|
text = reader.pages[page_num].extract_text() or ''
|
||||||
|
if _text_quality_ok(text):
|
||||||
|
return text
|
||||||
|
|
||||||
|
# Retry with tighter word-boundary detection
|
||||||
|
text_tight = reader.pages[page_num].extract_text(space_width=100.0) or ''
|
||||||
|
if len(text_tight.strip()) >= len(text.strip()):
|
||||||
|
return text_tight
|
||||||
|
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
|
def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
|
||||||
|
|
@ -302,13 +355,13 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
|
||||||
except Exception:
|
except Exception:
|
||||||
text = ''
|
text = ''
|
||||||
|
|
||||||
if len(text.strip()) >= 50:
|
if _text_quality_ok(text):
|
||||||
return text, 'pypdf2'
|
return text, 'pypdf2'
|
||||||
|
|
||||||
# Method 2: pdftotext via subprocess (inherently timeout-safe)
|
# Method 2: pdftotext via subprocess (inherently timeout-safe)
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['pdftotext', '-f', str(page_num + 1), '-l', str(page_num + 1), pdf_path, '-'],
|
['pdftotext', '-layout', '-f', str(page_num + 1), '-l', str(page_num + 1), pdf_path, '-'],
|
||||||
capture_output=True, text=True, timeout=page_timeout
|
capture_output=True, text=True, timeout=page_timeout
|
||||||
)
|
)
|
||||||
if result.returncode == 0 and len(result.stdout.strip()) > len(text.strip()):
|
if result.returncode == 0 and len(result.stdout.strip()) > len(text.strip()):
|
||||||
|
|
@ -316,7 +369,7 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if len(text.strip()) >= 50:
|
if _text_quality_ok(text):
|
||||||
return text, 'pdftotext'
|
return text, 'pdftotext'
|
||||||
|
|
||||||
# Method 3: pdftoppm + Tesseract OCR
|
# Method 3: pdftoppm + Tesseract OCR
|
||||||
|
|
@ -340,7 +393,7 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if len(text.strip()) >= 50:
|
if _text_quality_ok(text):
|
||||||
return text, 'tesseract'
|
return text, 'tesseract'
|
||||||
|
|
||||||
# Method 4: Gemini Vision (last resort — costs API calls but handles scanned docs)
|
# Method 4: Gemini Vision (last resort — costs API calls but handles scanned docs)
|
||||||
|
|
|
||||||
228
lib/netsyms.py
Normal file
228
lib/netsyms.py
Normal file
|
|
@ -0,0 +1,228 @@
|
||||||
|
"""
|
||||||
|
RECON Netsyms AddressDatabase2025 — SQLite-backed US+CA address lookup.
|
||||||
|
|
||||||
|
Provides 159.78M geocoded addresses as tier-2 between address book
|
||||||
|
(exact named locations) and Photon (full-text global geocoding).
|
||||||
|
|
||||||
|
Database: /mnt/nav/addresses/AddressDatabase2025.sqlite (read-only)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import threading
|
||||||
|
|
||||||
|
from .utils import setup_logging
|
||||||
|
|
||||||
|
logger = setup_logging('recon.netsyms')
|
||||||
|
|
||||||
|
_DB_PATH = '/mnt/nav/addresses/AddressDatabase2025.sqlite'
|
||||||
|
|
||||||
|
_conn = None
|
||||||
|
_lock = threading.Lock()
|
||||||
|
_cached_row_count = None
|
||||||
|
|
||||||
|
# US states + DC + territories, CA provinces, for free-text parsing
|
||||||
|
_STATE_CODES = {
|
||||||
|
'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
|
||||||
|
'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
|
||||||
|
'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
|
||||||
|
'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
|
||||||
|
'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
|
||||||
|
'DC', 'PR', 'VI', 'GU', 'AS', 'MP',
|
||||||
|
# Canadian provinces
|
||||||
|
'AB', 'BC', 'MB', 'NB', 'NL', 'NS', 'NT', 'NU', 'ON', 'PE',
|
||||||
|
'QC', 'SK', 'YT',
|
||||||
|
}
|
||||||
|
|
||||||
|
_NUMBER_RE = re.compile(r'^(\d+[\w-]*)(.*)$')
|
||||||
|
|
||||||
|
|
||||||
|
def _get_conn():
|
||||||
|
"""Lazy-open a read-only SQLite connection."""
|
||||||
|
global _conn
|
||||||
|
if _conn is not None:
|
||||||
|
return _conn
|
||||||
|
with _lock:
|
||||||
|
if _conn is not None:
|
||||||
|
return _conn
|
||||||
|
uri = f'file:{_DB_PATH}?mode=ro'
|
||||||
|
_conn = sqlite3.connect(uri, uri=True, check_same_thread=False)
|
||||||
|
_conn.row_factory = sqlite3.Row
|
||||||
|
logger.info("Netsyms DB opened: %s", _DB_PATH)
|
||||||
|
return _conn
|
||||||
|
|
||||||
|
|
||||||
|
def _row_to_dict(row):
|
||||||
|
"""Convert a sqlite3.Row to a plain dict with lat/lon keys."""
|
||||||
|
return {
|
||||||
|
'zipcode': row['zipcode'],
|
||||||
|
'number': row['number'],
|
||||||
|
'street': row['street'],
|
||||||
|
'street2': row['street2'],
|
||||||
|
'city': row['city'],
|
||||||
|
'state': row['state'],
|
||||||
|
'plus4': row['plus4'],
|
||||||
|
'country': row['country'],
|
||||||
|
'lat': float(row['latitude']),
|
||||||
|
'lon': float(row['longitude']),
|
||||||
|
'source': row['source'],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def lookup_by_street(number, street, city=None, state=None,
|
||||||
|
zipcode=None, country=None, limit=20):
|
||||||
|
"""Match on number + street, with optional qualifiers."""
|
||||||
|
conn = _get_conn()
|
||||||
|
clauses = ['number = ?', 'street = ?']
|
||||||
|
params = [str(number).strip().upper(), street.strip().upper()]
|
||||||
|
|
||||||
|
if city:
|
||||||
|
clauses.append('city = ?')
|
||||||
|
params.append(city.strip().upper())
|
||||||
|
if state:
|
||||||
|
clauses.append('state = ?')
|
||||||
|
params.append(state.strip().upper())
|
||||||
|
if zipcode:
|
||||||
|
clauses.append('zipcode = ?')
|
||||||
|
params.append(zipcode.strip())
|
||||||
|
if country:
|
||||||
|
clauses.append('country = ?')
|
||||||
|
params.append(country.strip().upper())
|
||||||
|
|
||||||
|
sql = f"SELECT * FROM addresses WHERE {' AND '.join(clauses)} LIMIT ?"
|
||||||
|
params.append(limit)
|
||||||
|
|
||||||
|
with _lock:
|
||||||
|
try:
|
||||||
|
rows = conn.execute(sql, params).fetchall()
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
logger.warning("Netsyms lookup_by_street error: %s", e)
|
||||||
|
return []
|
||||||
|
|
||||||
|
results = [_row_to_dict(r) for r in rows]
|
||||||
|
logger.debug("lookup_by_street(%s, %s, city=%s, state=%s) → %d results",
|
||||||
|
number, street, city, state, len(results))
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def lookup_free_text(query, country_hint=None):
|
||||||
|
"""Parse a free-text address and look it up."""
|
||||||
|
q = query.strip()
|
||||||
|
if not q:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Strip trailing zipcode if present
|
||||||
|
zipcode = None
|
||||||
|
zip_match = re.search(r'\b(\d{5})\s*$', q)
|
||||||
|
if zip_match:
|
||||||
|
zipcode = zip_match.group(1)
|
||||||
|
q = q[:zip_match.start()].strip().rstrip(',').strip()
|
||||||
|
|
||||||
|
# Strip trailing state
|
||||||
|
tokens = re.split(r'[,\s]+', q)
|
||||||
|
tokens = [t for t in tokens if t]
|
||||||
|
if not tokens:
|
||||||
|
return []
|
||||||
|
|
||||||
|
state = None
|
||||||
|
if len(tokens) >= 2 and tokens[-1].upper() in _STATE_CODES:
|
||||||
|
state = tokens[-1].upper()
|
||||||
|
tokens = tokens[:-1]
|
||||||
|
|
||||||
|
# Leading digits → number
|
||||||
|
number = None
|
||||||
|
if tokens and re.match(r'^\d', tokens[0]):
|
||||||
|
number = tokens[0]
|
||||||
|
tokens = tokens[1:]
|
||||||
|
|
||||||
|
if not tokens:
|
||||||
|
# Only a number, or empty — try zipcode if we have one
|
||||||
|
if zipcode:
|
||||||
|
return lookup_by_zipcode(zipcode, limit=20)
|
||||||
|
return []
|
||||||
|
|
||||||
|
# If state was found and we have 2+ tokens remaining, last token is city
|
||||||
|
city = None
|
||||||
|
if state and len(tokens) >= 2:
|
||||||
|
city = tokens[-1]
|
||||||
|
tokens = tokens[:-1]
|
||||||
|
|
||||||
|
street = ' '.join(tokens)
|
||||||
|
|
||||||
|
if number:
|
||||||
|
results = lookup_by_street(number, street, city=city, state=state,
|
||||||
|
zipcode=zipcode, country=country_hint)
|
||||||
|
if results:
|
||||||
|
logger.debug("lookup_free_text(%r) → %d results via street match",
|
||||||
|
query, len(results))
|
||||||
|
return results
|
||||||
|
|
||||||
|
# Fallback: try zipcode only if available
|
||||||
|
if zipcode:
|
||||||
|
return lookup_by_zipcode(zipcode, limit=20)
|
||||||
|
|
||||||
|
logger.debug("lookup_free_text(%r) → 0 results", query)
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def lookup_by_zipcode(zipcode, limit=100):
|
||||||
|
"""Direct zipcode lookup."""
|
||||||
|
conn = _get_conn()
|
||||||
|
sql = "SELECT * FROM addresses WHERE zipcode = ? LIMIT ?"
|
||||||
|
params = [zipcode.strip(), limit]
|
||||||
|
|
||||||
|
with _lock:
|
||||||
|
try:
|
||||||
|
rows = conn.execute(sql, params).fetchall()
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
logger.warning("Netsyms lookup_by_zipcode error: %s", e)
|
||||||
|
return []
|
||||||
|
|
||||||
|
results = [_row_to_dict(r) for r in rows]
|
||||||
|
logger.debug("lookup_by_zipcode(%s) → %d results", zipcode, len(results))
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def health():
|
||||||
|
"""Health check with cached row count."""
|
||||||
|
global _cached_row_count
|
||||||
|
|
||||||
|
try:
|
||||||
|
file_size = os.path.getsize(_DB_PATH)
|
||||||
|
except OSError:
|
||||||
|
return {'ok': False, 'row_count': 0, 'file_size_bytes': 0,
|
||||||
|
'indexed_countries': []}
|
||||||
|
|
||||||
|
try:
|
||||||
|
conn = _get_conn()
|
||||||
|
except Exception:
|
||||||
|
return {'ok': False, 'row_count': 0, 'file_size_bytes': file_size,
|
||||||
|
'indexed_countries': []}
|
||||||
|
|
||||||
|
if _cached_row_count is None:
|
||||||
|
with _lock:
|
||||||
|
if _cached_row_count is None:
|
||||||
|
try:
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT COUNT(*) AS cnt FROM addresses"
|
||||||
|
).fetchone()
|
||||||
|
_cached_row_count = row['cnt']
|
||||||
|
except sqlite3.Error:
|
||||||
|
_cached_row_count = 0
|
||||||
|
|
||||||
|
with _lock:
|
||||||
|
try:
|
||||||
|
rows = conn.execute(
|
||||||
|
"SELECT DISTINCT country FROM addresses"
|
||||||
|
).fetchall()
|
||||||
|
countries = sorted(r['country'] for r in rows)
|
||||||
|
except sqlite3.Error:
|
||||||
|
countries = []
|
||||||
|
|
||||||
|
return {
|
||||||
|
'ok': True,
|
||||||
|
'row_count': _cached_row_count,
|
||||||
|
'file_size_bytes': file_size,
|
||||||
|
'indexed_countries': countries,
|
||||||
|
}
|
||||||
31
lib/netsyms_api.py
Normal file
31
lib/netsyms_api.py
Normal file
|
|
@ -0,0 +1,31 @@
|
||||||
|
"""
|
||||||
|
RECON Netsyms API — Flask Blueprint.
|
||||||
|
|
||||||
|
GET /api/netsyms/lookup?q=<free text>&country=<optional>
|
||||||
|
GET /api/netsyms/health
|
||||||
|
"""
|
||||||
|
|
||||||
|
from flask import Blueprint, request, jsonify
|
||||||
|
|
||||||
|
from . import netsyms
|
||||||
|
from .utils import setup_logging
|
||||||
|
|
||||||
|
logger = setup_logging('recon.netsyms_api')
|
||||||
|
|
||||||
|
netsyms_bp = Blueprint('netsyms', __name__)
|
||||||
|
|
||||||
|
|
||||||
|
@netsyms_bp.route('/api/netsyms/lookup')
|
||||||
|
def api_netsyms_lookup():
|
||||||
|
q = request.args.get('q', '').strip()
|
||||||
|
if not q:
|
||||||
|
return jsonify({'error': 'Missing q parameter'}), 400
|
||||||
|
|
||||||
|
country = request.args.get('country', '').strip() or None
|
||||||
|
results = netsyms.lookup_free_text(q, country_hint=country)
|
||||||
|
return jsonify({'results': results, 'count': len(results), 'query': q})
|
||||||
|
|
||||||
|
|
||||||
|
@netsyms_bp.route('/api/netsyms/health')
|
||||||
|
def api_netsyms_health():
|
||||||
|
return jsonify(netsyms.health())
|
||||||
80
lib/netsyms_test.py
Normal file
80
lib/netsyms_test.py
Normal file
|
|
@ -0,0 +1,80 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Tests for Netsyms address database module."""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Ensure the lib directory is importable
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from lib import netsyms
|
||||||
|
|
||||||
|
|
||||||
|
def test_lookup_by_street_lowercase():
|
||||||
|
results = netsyms.lookup_by_street("214", "North St", city="Filer", state="ID")
|
||||||
|
assert len(results) >= 1, f"Expected at least 1 result, got {len(results)}"
|
||||||
|
r = results[0]
|
||||||
|
assert abs(r['lat'] - 42.5736) < 0.01, f"Lat mismatch: {r['lat']}"
|
||||||
|
assert abs(r['lon'] - (-114.6066)) < 0.01, f"Lon mismatch: {r['lon']}"
|
||||||
|
print(" PASS: lookup_by_street (lowercase)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_lookup_by_street_uppercase():
|
||||||
|
results = netsyms.lookup_by_street("214", "NORTH ST", city="FILER", state="ID")
|
||||||
|
assert len(results) >= 1, f"Expected at least 1 result, got {len(results)}"
|
||||||
|
r = results[0]
|
||||||
|
assert abs(r['lat'] - 42.5736) < 0.01, f"Lat mismatch: {r['lat']}"
|
||||||
|
print(" PASS: lookup_by_street (uppercase)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_lookup_nonexistent():
|
||||||
|
results = netsyms.lookup_by_street("999999", "Nonexistent Rd",
|
||||||
|
city="Filer", state="ID")
|
||||||
|
assert results == [], f"Expected empty list, got {len(results)} results"
|
||||||
|
print(" PASS: lookup_by_street (nonexistent)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_free_text_with_commas():
|
||||||
|
results = netsyms.lookup_free_text("214 North St, Filer, ID")
|
||||||
|
assert len(results) >= 1, f"Expected at least 1 result, got {len(results)}"
|
||||||
|
r = results[0]
|
||||||
|
assert r['city'] == 'FILER', f"City mismatch: {r['city']}"
|
||||||
|
assert r['state'] == 'ID', f"State mismatch: {r['state']}"
|
||||||
|
print(" PASS: lookup_free_text (commas)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_free_text_no_commas():
|
||||||
|
results = netsyms.lookup_free_text("214 North St Filer ID")
|
||||||
|
assert len(results) >= 1, f"Expected at least 1 result, got {len(results)}"
|
||||||
|
r = results[0]
|
||||||
|
assert r['state'] == 'ID', f"State mismatch: {r['state']}"
|
||||||
|
print(" PASS: lookup_free_text (no commas)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_lookup_by_zipcode():
|
||||||
|
results = netsyms.lookup_by_zipcode("83328", limit=5)
|
||||||
|
assert len(results) == 5, f"Expected 5 results, got {len(results)}"
|
||||||
|
for r in results:
|
||||||
|
assert r['zipcode'] == '83328', f"Zipcode mismatch: {r['zipcode']}"
|
||||||
|
print(" PASS: lookup_by_zipcode")
|
||||||
|
|
||||||
|
|
||||||
|
def test_health():
|
||||||
|
h = netsyms.health()
|
||||||
|
assert h['ok'] is True, f"Health not OK: {h}"
|
||||||
|
assert h['row_count'] >= 159_000_000, f"Row count too low: {h['row_count']}"
|
||||||
|
assert 'US' in h['indexed_countries'], f"US not in countries: {h['indexed_countries']}"
|
||||||
|
assert 'CA' in h['indexed_countries'], f"CA not in countries: {h['indexed_countries']}"
|
||||||
|
print(" PASS: health")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
print("Running Netsyms tests...")
|
||||||
|
test_lookup_by_street_lowercase()
|
||||||
|
test_lookup_by_street_uppercase()
|
||||||
|
test_lookup_nonexistent()
|
||||||
|
test_free_text_with_commas()
|
||||||
|
test_free_text_no_commas()
|
||||||
|
test_lookup_by_zipcode()
|
||||||
|
test_health()
|
||||||
|
print("All tests passed.")
|
||||||
493
lib/processors/zim_processor.py
Normal file
493
lib/processors/zim_processor.py
Normal file
|
|
@ -0,0 +1,493 @@
|
||||||
|
"""
|
||||||
|
RECON ZIM Processor
|
||||||
|
|
||||||
|
Batch importer for ZIM files. Opens a ZIM via python-libzim, iterates
|
||||||
|
HTML articles, strips to clean text, creates processing directories,
|
||||||
|
and registers each article as "extracted" for the enricher to pick up.
|
||||||
|
|
||||||
|
This is NOT a dispatcher-style processor (no pre_flight). ZIMs contain
|
||||||
|
thousands of articles — ingestion is triggered explicitly or by the
|
||||||
|
ZIM monitor.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 -m lib.processors.zim_processor --zim-source-id 1
|
||||||
|
python3 -m lib.processors.zim_processor --zim-source-id 1 --limit 100 --batch-size 50
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
from lxml import html as lxml_html
|
||||||
|
|
||||||
|
sys.path.insert(0, "/opt/recon")
|
||||||
|
|
||||||
|
from lib.utils import setup_logging, get_config
|
||||||
|
from lib.status import StatusDB
|
||||||
|
from lib.web_scraper import chunk_text
|
||||||
|
|
||||||
|
logger = logging.getLogger("recon.processors.zim")
|
||||||
|
|
||||||
|
WORDS_PER_PAGE = 2000
|
||||||
|
MIN_TEXT_LENGTH = 200
|
||||||
|
|
||||||
|
# Elements to strip before text extraction
|
||||||
|
STRIP_TAGS = {'nav', 'footer', 'script', 'style', 'header', 'aside'}
|
||||||
|
|
||||||
|
# Non-English article path suffix pattern (MediaWiki ZIMs use /XX or /XXX suffixes)
|
||||||
|
# Matches paths ending in /xx where xx is a 2-3 letter lowercase language code
|
||||||
|
_LANG_SUFFIX_RE = re.compile(r'/[a-z]{2,3}$')
|
||||||
|
# Common ISO 639-1/2 language codes to filter (excludes 'en')
|
||||||
|
_NON_EN_LANGS = {
|
||||||
|
'aa','ab','af','ak','am','an','ar','as','av','ay','az',
|
||||||
|
'ba','be','bg','bh','bi','bm','bn','bo','br','bs',
|
||||||
|
'ca','ce','ch','co','cr','cs','cu','cv','cy',
|
||||||
|
'da','de','dv','dz',
|
||||||
|
'ee','el','eo','es','et','eu',
|
||||||
|
'fa','ff','fi','fj','fo','fr','fy',
|
||||||
|
'ga','gd','gl','gn','gu','gv',
|
||||||
|
'ha','he','hi','ho','hr','ht','hu','hy','hz',
|
||||||
|
'ia','id','ie','ig','ii','ik','io','is','it','iu',
|
||||||
|
'ja','jv',
|
||||||
|
'ka','kg','ki','kj','kk','kl','km','kn','ko','kr','ks','ku','kv','kw','ky',
|
||||||
|
'la','lb','lg','li','ln','lo','lt','lu','lv',
|
||||||
|
'mg','mh','mi','mk','ml','mn','mo','mr','ms','mt','my',
|
||||||
|
'na','nb','nd','ne','ng','nl','nn','no','nr','nv','ny',
|
||||||
|
'oc','oj','om','or','os',
|
||||||
|
'pa','pi','pl','ps','pt',
|
||||||
|
'qu',
|
||||||
|
'rm','rn','ro','ru','rw',
|
||||||
|
'sa','sc','sd','se','sg','sh','si','sk','sl','sm','sn','so','sq','sr','ss','st','su','sv','sw',
|
||||||
|
'ta','te','tg','th','ti','tk','tl','tn','to','tr','ts','tt','tw','ty',
|
||||||
|
'ug','uk','ur','uz',
|
||||||
|
've','vi','vo',
|
||||||
|
'wa','wo',
|
||||||
|
'xh',
|
||||||
|
'yi','yo',
|
||||||
|
'za','zh','zu',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _text_hash(text):
|
||||||
|
"""Compute MD5 hash of text content (matching content_hash style)."""
|
||||||
|
return hashlib.md5(text.encode('utf-8')).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _flatten_table(table_el):
|
||||||
|
"""Convert a <table> element to pipe-delimited text.
|
||||||
|
|
||||||
|
Each <tr> becomes a row with cells joined by ' | '.
|
||||||
|
Returns the formatted table as a string with blank lines around it.
|
||||||
|
"""
|
||||||
|
rows = []
|
||||||
|
for tr in table_el.iter('tr'):
|
||||||
|
cells = []
|
||||||
|
for cell in tr:
|
||||||
|
if cell.tag in ('td', 'th'):
|
||||||
|
cell_text = (cell.text_content() or '').strip()
|
||||||
|
# Collapse internal whitespace in each cell
|
||||||
|
cell_text = re.sub(r'\s+', ' ', cell_text)
|
||||||
|
if cell_text:
|
||||||
|
cells.append(cell_text)
|
||||||
|
if cells:
|
||||||
|
rows.append(' | '.join(cells))
|
||||||
|
if not rows:
|
||||||
|
return ''
|
||||||
|
return '\n'.join(rows)
|
||||||
|
|
||||||
|
|
||||||
|
def _preprocess_tree(doc):
|
||||||
|
"""Pre-process HTML tree to add delimiters before text_content() flattens it.
|
||||||
|
|
||||||
|
Handles: <table>, <br>, <li>, <dt>, <dd> -- elements that lxml's
|
||||||
|
text_content() would concatenate without any separators.
|
||||||
|
"""
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
# 1. Replace <table> elements with their pipe-delimited text
|
||||||
|
for table in list(doc.iter('table')):
|
||||||
|
formatted = _flatten_table(table)
|
||||||
|
if formatted:
|
||||||
|
replacement = etree.Element('div')
|
||||||
|
replacement.text = '\n\n' + formatted + '\n\n'
|
||||||
|
parent = table.getparent()
|
||||||
|
if parent is not None:
|
||||||
|
parent.replace(table, replacement)
|
||||||
|
else:
|
||||||
|
table.drop_tree()
|
||||||
|
|
||||||
|
# 2. <br> -> inject newline
|
||||||
|
for br in list(doc.iter('br')):
|
||||||
|
br.tail = '\n' + (br.tail or '')
|
||||||
|
|
||||||
|
# 3. <li> -> inject newline + "- " prefix
|
||||||
|
for li in list(doc.iter('li')):
|
||||||
|
li.text = '- ' + (li.text or '')
|
||||||
|
li.tail = '\n' + (li.tail or '')
|
||||||
|
|
||||||
|
# 4. <dt> -> inject newline before
|
||||||
|
for dt in list(doc.iter('dt')):
|
||||||
|
dt.tail = '\n' + (dt.tail or '')
|
||||||
|
|
||||||
|
# 5. <dd> -> inject newline + indent
|
||||||
|
for dd in list(doc.iter('dd')):
|
||||||
|
dd.text = ' ' + (dd.text or '')
|
||||||
|
dd.tail = '\n' + (dd.tail or '')
|
||||||
|
|
||||||
|
|
||||||
|
def _html_to_text(html_bytes):
|
||||||
|
"""Convert HTML bytes to clean text via lxml.
|
||||||
|
|
||||||
|
Strips nav, footer, script, style elements. Decodes entities.
|
||||||
|
Pre-processes tables, lists, and line breaks for proper delimiters.
|
||||||
|
Normalizes whitespace.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
doc = lxml_html.fromstring(html_bytes)
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
# Strip unwanted elements
|
||||||
|
for tag in STRIP_TAGS:
|
||||||
|
for el in doc.iter(tag):
|
||||||
|
el.drop_tree()
|
||||||
|
|
||||||
|
# Pre-process tree: tables -> pipe-delimited, br -> newlines, li -> dashes
|
||||||
|
_preprocess_tree(doc)
|
||||||
|
|
||||||
|
# Extract text
|
||||||
|
text = doc.text_content()
|
||||||
|
|
||||||
|
# Normalize whitespace: collapse runs of spaces, normalize newlines
|
||||||
|
text = re.sub(r'[ \t]+', ' ', text)
|
||||||
|
text = re.sub(r'\n{3,}', '\n\n', text)
|
||||||
|
text = text.strip()
|
||||||
|
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_zim(zim_source_id, db, config, stop_event=None,
|
||||||
|
batch_size=100, batch_delay=1.0, limit=None):
|
||||||
|
"""Process all articles from a ZIM file registered in zim_sources.
|
||||||
|
|
||||||
|
- Reads zim_path from zim_sources table
|
||||||
|
- Iterates articles, creates processing dirs, registers in DB
|
||||||
|
- Checkpoints progress via zim_sources.last_checkpoint
|
||||||
|
- Respects stop_event for graceful shutdown
|
||||||
|
- Yields after each batch to avoid monopolizing resources
|
||||||
|
|
||||||
|
Args:
|
||||||
|
zim_source_id: ID in zim_sources table
|
||||||
|
db: StatusDB instance
|
||||||
|
config: RECON config dict
|
||||||
|
stop_event: threading.Event for graceful shutdown (optional)
|
||||||
|
batch_size: articles per batch before sleeping
|
||||||
|
batch_delay: seconds to sleep between batches
|
||||||
|
limit: max articles to process (None = all)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict with counts: processed, skipped, duplicates, errors
|
||||||
|
"""
|
||||||
|
from libzim.reader import Archive
|
||||||
|
|
||||||
|
conn = db._get_conn()
|
||||||
|
|
||||||
|
# Load ZIM source record
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT * FROM zim_sources WHERE id = ?", (zim_source_id,)
|
||||||
|
).fetchone()
|
||||||
|
if not row:
|
||||||
|
logger.error("ZIM source ID %d not found", zim_source_id)
|
||||||
|
return {'processed': 0, 'skipped': 0, 'duplicates': 0, 'errors': 0}
|
||||||
|
|
||||||
|
zim_source = dict(row)
|
||||||
|
zim_path = zim_source['zim_path']
|
||||||
|
zim_filename = zim_source['zim_filename']
|
||||||
|
zim_title = zim_source.get('title') or zim_filename
|
||||||
|
|
||||||
|
if not os.path.isfile(zim_path):
|
||||||
|
logger.error("ZIM file not found: %s", zim_path)
|
||||||
|
return {'processed': 0, 'skipped': 0, 'duplicates': 0, 'errors': 0}
|
||||||
|
|
||||||
|
logger.info("Opening ZIM: %s (%s)", zim_title, zim_filename)
|
||||||
|
zim = Archive(zim_path)
|
||||||
|
total_entries = zim.entry_count
|
||||||
|
|
||||||
|
# Read checkpoint to resume from
|
||||||
|
last_checkpoint = zim_source.get('last_checkpoint')
|
||||||
|
start_idx = 0
|
||||||
|
if last_checkpoint:
|
||||||
|
try:
|
||||||
|
start_idx = int(last_checkpoint)
|
||||||
|
logger.info("Resuming from checkpoint: entry %d", start_idx)
|
||||||
|
except ValueError:
|
||||||
|
logger.warning("Invalid checkpoint value: %s, starting from 0", last_checkpoint)
|
||||||
|
|
||||||
|
# Update status to ingesting
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE zim_sources SET status = 'ingesting', started_at = CURRENT_TIMESTAMP WHERE id = ?",
|
||||||
|
(zim_source_id,)
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
processing_root = config.get('pipeline', {}).get(
|
||||||
|
'processing_root', '/opt/recon/data/processing'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get already-processed article paths for this ZIM source (dedup within ZIM)
|
||||||
|
existing_paths = set()
|
||||||
|
for r in conn.execute(
|
||||||
|
"SELECT article_path FROM zim_articles WHERE zim_source_id = ?",
|
||||||
|
(zim_source_id,)
|
||||||
|
).fetchall():
|
||||||
|
existing_paths.add(r['article_path'])
|
||||||
|
|
||||||
|
stats = {'processed': 0, 'skipped': 0, 'duplicates': 0, 'errors': 0}
|
||||||
|
# Track what was already flushed to DB to avoid double-counting
|
||||||
|
flushed = {'processed': 0, 'skipped': 0, 'duplicates': 0, 'errors': 0}
|
||||||
|
batch_count = 0
|
||||||
|
total_processed_this_run = 0
|
||||||
|
last_entry_idx = start_idx
|
||||||
|
|
||||||
|
for entry_idx in range(start_idx, total_entries):
|
||||||
|
if stop_event and stop_event.is_set():
|
||||||
|
logger.info("Stop event set, halting ZIM ingest at entry %d", entry_idx)
|
||||||
|
break
|
||||||
|
|
||||||
|
if limit and total_processed_this_run >= limit:
|
||||||
|
logger.info("Reached limit of %d articles", limit)
|
||||||
|
break
|
||||||
|
|
||||||
|
last_entry_idx = entry_idx
|
||||||
|
|
||||||
|
try:
|
||||||
|
entry = zim._get_entry_by_id(entry_idx)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Skip redirects
|
||||||
|
if entry.is_redirect:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
item = entry.get_item()
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Skip non-HTML
|
||||||
|
if item.mimetype != "text/html":
|
||||||
|
continue
|
||||||
|
|
||||||
|
article_path = entry.path
|
||||||
|
article_title = entry.title
|
||||||
|
|
||||||
|
# Skip if already processed in a prior run
|
||||||
|
if article_path in existing_paths:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Skip non-English articles (MediaWiki translation suffix pattern)
|
||||||
|
lang_match = _LANG_SUFFIX_RE.search(article_path)
|
||||||
|
if lang_match and lang_match.group(0)[1:] in _NON_EN_LANGS:
|
||||||
|
stats['skipped'] += 1
|
||||||
|
total_processed_this_run += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Extract and clean text
|
||||||
|
try:
|
||||||
|
html_bytes = bytes(item.content)
|
||||||
|
clean_text = _html_to_text(html_bytes)
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("HTML extraction failed for %s: %s", article_path, e)
|
||||||
|
stats['errors'] += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Skip stubs
|
||||||
|
if len(clean_text) < MIN_TEXT_LENGTH:
|
||||||
|
stats['skipped'] += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Compute content hash
|
||||||
|
file_hash = _text_hash(clean_text)
|
||||||
|
|
||||||
|
# Deduplicate against existing catalogue
|
||||||
|
cat_row = conn.execute(
|
||||||
|
"SELECT hash FROM catalogue WHERE hash = ?", (file_hash,)
|
||||||
|
).fetchone()
|
||||||
|
if cat_row:
|
||||||
|
# Record in zim_articles as skipped duplicate
|
||||||
|
conn.execute(
|
||||||
|
"""INSERT OR IGNORE INTO zim_articles
|
||||||
|
(zim_source_id, article_path, article_title, status, processed_at)
|
||||||
|
VALUES (?, ?, ?, 'skipped', CURRENT_TIMESTAMP)""",
|
||||||
|
(zim_source_id, article_path, article_title)
|
||||||
|
)
|
||||||
|
stats['duplicates'] += 1
|
||||||
|
total_processed_this_run += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Create processing directory
|
||||||
|
proc_dir = os.path.join(processing_root, file_hash)
|
||||||
|
try:
|
||||||
|
os.makedirs(proc_dir, exist_ok=True)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Cannot create processing dir %s: %s", proc_dir, e)
|
||||||
|
stats['errors'] += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Split into page files
|
||||||
|
pages = chunk_text(clean_text, WORDS_PER_PAGE)
|
||||||
|
for i, page_text in enumerate(pages, start=1):
|
||||||
|
page_path = os.path.join(proc_dir, "page_{:04d}.txt".format(i))
|
||||||
|
with open(page_path, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(page_text)
|
||||||
|
|
||||||
|
# Write meta.json
|
||||||
|
meta = {
|
||||||
|
'hash': file_hash,
|
||||||
|
'filename': article_title + '.html',
|
||||||
|
'source_type': 'zim',
|
||||||
|
'zim_file': zim_filename,
|
||||||
|
'zim_source_id': zim_source_id,
|
||||||
|
'article_title': article_title,
|
||||||
|
'article_path': article_path,
|
||||||
|
'page_count': len(pages),
|
||||||
|
'text_length': len(clean_text),
|
||||||
|
}
|
||||||
|
with open(os.path.join(proc_dir, 'meta.json'), 'w', encoding='utf-8') as f:
|
||||||
|
json.dump(meta, f, indent=2)
|
||||||
|
|
||||||
|
# Register in catalogue
|
||||||
|
db.add_to_catalogue(
|
||||||
|
file_hash,
|
||||||
|
article_title + '.html',
|
||||||
|
zim_path, # source path is the ZIM file
|
||||||
|
len(clean_text), # size in bytes (text)
|
||||||
|
'kiwix', # source
|
||||||
|
zim_title, # category = ZIM title
|
||||||
|
)
|
||||||
|
|
||||||
|
# Queue document
|
||||||
|
db.queue_document(file_hash)
|
||||||
|
|
||||||
|
# Set text_dir, page_count, book_title on documents row
|
||||||
|
# Mark organized_at immediately (ZIM articles don't get filed to library)
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE documents SET text_dir = ?, page_count = ?, "
|
||||||
|
"book_title = ?, organized_at = CURRENT_TIMESTAMP "
|
||||||
|
"WHERE hash = ?",
|
||||||
|
(proc_dir, len(pages), article_title, file_hash)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Update status to extracted
|
||||||
|
db.update_status(file_hash, 'extracted', pages_extracted=len(pages))
|
||||||
|
|
||||||
|
# Record in zim_articles
|
||||||
|
conn.execute(
|
||||||
|
"""INSERT OR IGNORE INTO zim_articles
|
||||||
|
(zim_source_id, article_path, article_title, status, processed_at)
|
||||||
|
VALUES (?, ?, ?, 'pending', CURRENT_TIMESTAMP)""",
|
||||||
|
(zim_source_id, article_path, article_title)
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
stats['processed'] += 1
|
||||||
|
total_processed_this_run += 1
|
||||||
|
batch_count += 1
|
||||||
|
|
||||||
|
# Progress logging
|
||||||
|
total_done = zim_source['processed_count'] + stats['processed']
|
||||||
|
article_count = zim_source.get('article_count', 0)
|
||||||
|
if stats['processed'] % 500 == 0 and article_count > 0:
|
||||||
|
pct = total_done / article_count * 100
|
||||||
|
logger.info(
|
||||||
|
"ZIM ingest [%s]: %s/%s (%.1f%%)",
|
||||||
|
zim_title, f"{total_done:,}", f"{article_count:,}", pct
|
||||||
|
)
|
||||||
|
|
||||||
|
# Batch checkpoint — flush only the delta since last flush
|
||||||
|
if batch_count >= batch_size:
|
||||||
|
delta_p = stats['processed'] - flushed['processed']
|
||||||
|
delta_s = (stats['skipped'] + stats['duplicates']) - (flushed['skipped'] + flushed['duplicates'])
|
||||||
|
delta_e = stats['errors'] - flushed['errors']
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE zim_sources SET processed_count = processed_count + ?, "
|
||||||
|
"skipped_count = skipped_count + ?, error_count = error_count + ?, "
|
||||||
|
"last_checkpoint = ? WHERE id = ?",
|
||||||
|
(delta_p, delta_s, delta_e, str(entry_idx + 1), zim_source_id)
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
flushed['processed'] = stats['processed']
|
||||||
|
flushed['skipped'] = stats['skipped']
|
||||||
|
flushed['duplicates'] = stats['duplicates']
|
||||||
|
flushed['errors'] = stats['errors']
|
||||||
|
|
||||||
|
batch_count = 0
|
||||||
|
|
||||||
|
if batch_delay > 0:
|
||||||
|
time.sleep(batch_delay)
|
||||||
|
|
||||||
|
# Final checkpoint — flush only the unflushed delta
|
||||||
|
final_status = 'complete'
|
||||||
|
if limit and total_processed_this_run >= limit:
|
||||||
|
final_status = 'ingesting' # not done yet, just hit the limit
|
||||||
|
|
||||||
|
delta_p = stats['processed'] - flushed['processed']
|
||||||
|
delta_s = (stats['skipped'] + stats['duplicates']) - (flushed['skipped'] + flushed['duplicates'])
|
||||||
|
delta_e = stats['errors'] - flushed['errors']
|
||||||
|
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE zim_sources SET processed_count = processed_count + ?, "
|
||||||
|
"skipped_count = skipped_count + ?, error_count = error_count + ?, "
|
||||||
|
"last_checkpoint = ?, status = ?, completed_at = CASE WHEN ? = 'complete' THEN CURRENT_TIMESTAMP ELSE completed_at END "
|
||||||
|
"WHERE id = ?",
|
||||||
|
(delta_p, delta_s, delta_e, str(last_entry_idx + 1),
|
||||||
|
final_status, final_status, zim_source_id)
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"ZIM ingest [%s] %s: %d processed, %d skipped, %d duplicates, %d errors",
|
||||||
|
zim_title, final_status,
|
||||||
|
stats['processed'], stats['skipped'], stats['duplicates'], stats['errors']
|
||||||
|
)
|
||||||
|
|
||||||
|
return stats
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""CLI entry point for standalone ZIM processing."""
|
||||||
|
parser = argparse.ArgumentParser(description="RECON ZIM Processor")
|
||||||
|
parser.add_argument('--zim-source-id', type=int, required=True,
|
||||||
|
help="ID from zim_sources table")
|
||||||
|
parser.add_argument('--batch-size', type=int, default=100,
|
||||||
|
help="Articles per batch (default: 100)")
|
||||||
|
parser.add_argument('--batch-delay', type=float, default=1.0,
|
||||||
|
help="Seconds between batches (default: 1.0)")
|
||||||
|
parser.add_argument('--limit', type=int, default=None,
|
||||||
|
help="Max articles to process (default: all)")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
setup_logging('recon.processors.zim')
|
||||||
|
|
||||||
|
config = get_config()
|
||||||
|
db = StatusDB(config['paths']['db'])
|
||||||
|
|
||||||
|
stats = ingest_zim(
|
||||||
|
zim_source_id=args.zim_source_id,
|
||||||
|
db=db,
|
||||||
|
config=config,
|
||||||
|
batch_size=args.batch_size,
|
||||||
|
batch_delay=args.batch_delay,
|
||||||
|
limit=args.limit,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"\nResults: {stats['processed']} processed, {stats['skipped']} skipped, "
|
||||||
|
f"{stats['duplicates']} duplicates, {stats['errors']} errors")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
161
lib/query_router.py
Normal file
161
lib/query_router.py
Normal file
|
|
@ -0,0 +1,161 @@
|
||||||
|
"""Semantic query router for Aurora.
|
||||||
|
|
||||||
|
Classifies user queries into routes (nav_route, nav_reverse_geocode,
|
||||||
|
direct_answer, rag_search) by comparing query embeddings against
|
||||||
|
pre-computed route centroids from example queries.
|
||||||
|
|
||||||
|
TEI endpoint: http://100.64.0.14:8090/embed (cortex via Tailscale)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import math
|
||||||
|
import threading
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# ── Route examples ────────────────────────────────────────────────────────────
|
||||||
|
ROUTE_EXAMPLES = {
|
||||||
|
"nav_route": [
|
||||||
|
"how do I get to Boise",
|
||||||
|
"directions to Twin Falls",
|
||||||
|
"how do I get from Buhl to Boise",
|
||||||
|
"drive from Jerome to Sun Valley",
|
||||||
|
"route from Boise to McCall",
|
||||||
|
"what's the fastest way to Sun Valley",
|
||||||
|
"how far is it to Twin Falls",
|
||||||
|
"take me to Shoshone",
|
||||||
|
"navigate to the airport",
|
||||||
|
"how do I drive to Salt Lake City",
|
||||||
|
"walking directions to the park",
|
||||||
|
"bike route to downtown",
|
||||||
|
],
|
||||||
|
"nav_reverse_geocode": [
|
||||||
|
"what town is at 42.5, -114.7",
|
||||||
|
"where am I right now",
|
||||||
|
"what is at coordinates 43.6, -116.2",
|
||||||
|
"what location is 42.574, -114.607",
|
||||||
|
"where is this place 44.0, -114.3",
|
||||||
|
"what city is near 42.7, -114.5",
|
||||||
|
"reverse geocode 43.0, -115.0",
|
||||||
|
"what's at this location 42.9, -114.8",
|
||||||
|
],
|
||||||
|
"direct_answer": [
|
||||||
|
"hello",
|
||||||
|
"hey aurora",
|
||||||
|
"good morning",
|
||||||
|
"thanks",
|
||||||
|
"thank you",
|
||||||
|
"what's your name",
|
||||||
|
"who are you",
|
||||||
|
"tell me a joke",
|
||||||
|
"how are you",
|
||||||
|
"hi there",
|
||||||
|
],
|
||||||
|
"rag_search": [
|
||||||
|
"what does the survival manual say about water",
|
||||||
|
"how to purify water in the field",
|
||||||
|
"how to treat a gunshot wound",
|
||||||
|
"what is the ranger handbook chapter on patrolling",
|
||||||
|
"field manual water purification",
|
||||||
|
"how to build a shelter in the wilderness",
|
||||||
|
"tactical combat casualty care procedures",
|
||||||
|
"what does FM 21-76 say about fire starting",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Module-level cache ────────────────────────────────────────────────────────
|
||||||
|
_ROUTE_CENTROIDS: dict | None = None
|
||||||
|
_LOCK = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def _embed_batch(texts: list[str], tei_url: str) -> list[list[float]]:
|
||||||
|
"""Embed a batch of texts via TEI."""
|
||||||
|
resp = requests.post(tei_url, json={"inputs": texts}, timeout=30)
|
||||||
|
resp.raise_for_status()
|
||||||
|
return resp.json()
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_centroid(vectors: list[list[float]]) -> list[float]:
|
||||||
|
"""Element-wise mean of vectors."""
|
||||||
|
n = len(vectors)
|
||||||
|
dim = len(vectors[0])
|
||||||
|
centroid = [0.0] * dim
|
||||||
|
for vec in vectors:
|
||||||
|
for i in range(dim):
|
||||||
|
centroid[i] += vec[i]
|
||||||
|
for i in range(dim):
|
||||||
|
centroid[i] /= n
|
||||||
|
return centroid
|
||||||
|
|
||||||
|
|
||||||
|
def _cosine_similarity(a: list[float], b: list[float]) -> float:
|
||||||
|
"""Cosine similarity between two vectors (pure Python)."""
|
||||||
|
dot = 0.0
|
||||||
|
norm_a = 0.0
|
||||||
|
norm_b = 0.0
|
||||||
|
for i in range(len(a)):
|
||||||
|
dot += a[i] * b[i]
|
||||||
|
norm_a += a[i] * a[i]
|
||||||
|
norm_b += b[i] * b[i]
|
||||||
|
denom = math.sqrt(norm_a) * math.sqrt(norm_b)
|
||||||
|
if denom == 0:
|
||||||
|
return 0.0
|
||||||
|
return dot / denom
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_centroids(tei_url: str) -> dict[str, list[float]]:
|
||||||
|
"""Lazy-init: embed all examples in one batch, compute centroids, cache."""
|
||||||
|
global _ROUTE_CENTROIDS
|
||||||
|
if _ROUTE_CENTROIDS is not None:
|
||||||
|
return _ROUTE_CENTROIDS
|
||||||
|
|
||||||
|
with _LOCK:
|
||||||
|
if _ROUTE_CENTROIDS is not None:
|
||||||
|
return _ROUTE_CENTROIDS
|
||||||
|
|
||||||
|
# Flatten all examples into one batch
|
||||||
|
all_texts = []
|
||||||
|
route_ranges: dict[str, tuple[int, int]] = {}
|
||||||
|
offset = 0
|
||||||
|
for route, examples in ROUTE_EXAMPLES.items():
|
||||||
|
route_ranges[route] = (offset, offset + len(examples))
|
||||||
|
all_texts.extend(examples)
|
||||||
|
offset += len(examples)
|
||||||
|
|
||||||
|
all_vectors = _embed_batch(all_texts, tei_url)
|
||||||
|
|
||||||
|
centroids = {}
|
||||||
|
for route, (start, end) in route_ranges.items():
|
||||||
|
centroids[route] = _compute_centroid(all_vectors[start:end])
|
||||||
|
|
||||||
|
_ROUTE_CENTROIDS = centroids
|
||||||
|
return _ROUTE_CENTROIDS
|
||||||
|
|
||||||
|
|
||||||
|
def classify(
|
||||||
|
query: str,
|
||||||
|
tei_url: str = "http://100.64.0.14:8090/embed",
|
||||||
|
threshold: float = 0.45,
|
||||||
|
) -> tuple[str, float]:
|
||||||
|
"""Classify a query into a route.
|
||||||
|
|
||||||
|
Returns (route_name, confidence). If no route exceeds the threshold,
|
||||||
|
returns ("rag_search", best_score) as the safe default.
|
||||||
|
"""
|
||||||
|
centroids = _ensure_centroids(tei_url)
|
||||||
|
|
||||||
|
# Embed the query
|
||||||
|
vecs = _embed_batch([query], tei_url)
|
||||||
|
query_vec = vecs[0]
|
||||||
|
|
||||||
|
# Compare against all centroids
|
||||||
|
best_route = "rag_search"
|
||||||
|
best_score = 0.0
|
||||||
|
for route, centroid in centroids.items():
|
||||||
|
sim = _cosine_similarity(query_vec, centroid)
|
||||||
|
if sim > best_score:
|
||||||
|
best_score = sim
|
||||||
|
best_route = route
|
||||||
|
|
||||||
|
if best_score < threshold:
|
||||||
|
return ("rag_search", best_score)
|
||||||
|
|
||||||
|
return (best_route, best_score)
|
||||||
49
lib/query_router_test.py
Normal file
49
lib/query_router_test.py
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Test suite for the semantic query router."""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from lib.query_router import classify
|
||||||
|
|
||||||
|
TEST_QUERIES = [
|
||||||
|
("how do I get from Buhl to Boise", "nav_route"),
|
||||||
|
("what does the survival manual say about water", "rag_search"),
|
||||||
|
("what town is at 42.5, -114.7", "nav_reverse_geocode"),
|
||||||
|
("hey aurora", "direct_answer"),
|
||||||
|
("what's the fastest way to Sun Valley", "nav_route"),
|
||||||
|
("how to purify water in the field", "rag_search"),
|
||||||
|
("good morning", "direct_answer"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
print("Query Router Test Suite")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
passed = 0
|
||||||
|
failed = 0
|
||||||
|
|
||||||
|
for query, expected in TEST_QUERIES:
|
||||||
|
route, confidence = classify(query)
|
||||||
|
status = "PASS" if route == expected else "FAIL"
|
||||||
|
if status == "PASS":
|
||||||
|
passed += 1
|
||||||
|
else:
|
||||||
|
failed += 1
|
||||||
|
print(f" [{status}] {query!r}")
|
||||||
|
print(f" → {route} ({confidence:.3f}) expected={expected}")
|
||||||
|
|
||||||
|
print("=" * 70)
|
||||||
|
print(f"Results: {passed}/{passed + failed} passed")
|
||||||
|
if failed:
|
||||||
|
print(f" {failed} FAILED")
|
||||||
|
sys.exit(1)
|
||||||
|
else:
|
||||||
|
print(" All tests passed!")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
387
lib/scraper_runner.py
Normal file
387
lib/scraper_runner.py
Normal file
|
|
@ -0,0 +1,387 @@
|
||||||
|
"""
|
||||||
|
RECON Scraper Runner
|
||||||
|
|
||||||
|
Daemon loop that processes scrape jobs: crawl via Zimit → kiwix-manage.
|
||||||
|
Zimit (openZIM Docker crawler) handles all site types and produces ZIM
|
||||||
|
files directly — no separate zimwriterfs step needed.
|
||||||
|
|
||||||
|
Public entry point: scraper_loop(stop_event, config).
|
||||||
|
|
||||||
|
Config section: scraper (output_dir, docker_image, docker_workers, poll_interval)
|
||||||
|
DB table: scrape_jobs (status flow: pending → scraping → registering → complete)
|
||||||
|
"""
|
||||||
|
import glob as _glob
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import signal
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from .utils import setup_logging
|
||||||
|
from .status import StatusDB
|
||||||
|
|
||||||
|
logger = setup_logging('recon.scraper_runner')
|
||||||
|
|
||||||
|
|
||||||
|
def scraper_loop(stop_event, config):
|
||||||
|
"""Daemon loop: poll for pending scrape jobs, execute pipeline."""
|
||||||
|
scraper_cfg = config.get('scraper', {})
|
||||||
|
poll_interval = scraper_cfg.get('poll_interval', 300)
|
||||||
|
|
||||||
|
logger.info("Scraper runner started")
|
||||||
|
|
||||||
|
# Clean up any orphan Zimit containers from a previous crash
|
||||||
|
_cleanup_orphan_containers()
|
||||||
|
|
||||||
|
while not stop_event.is_set():
|
||||||
|
db = StatusDB()
|
||||||
|
job = db.get_pending_scrape_job()
|
||||||
|
if job:
|
||||||
|
try:
|
||||||
|
_process_job(job, config, stop_event)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Scraper job {job['id']} unexpected error: {e}", exc_info=True)
|
||||||
|
try:
|
||||||
|
db.update_scrape_job(job['id'],
|
||||||
|
status='failed',
|
||||||
|
error_message=str(e)[:1000],
|
||||||
|
subprocess_pid=None,
|
||||||
|
completed_at=_now())
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
stop_event.wait(poll_interval)
|
||||||
|
|
||||||
|
logger.info("Scraper runner stopped")
|
||||||
|
|
||||||
|
|
||||||
|
def _now():
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_domain(url):
|
||||||
|
"""Extract and sanitize domain from URL for use in filenames."""
|
||||||
|
parsed = urlparse(url)
|
||||||
|
domain = parsed.hostname or 'unknown'
|
||||||
|
if domain.startswith('www.'):
|
||||||
|
domain = domain[4:]
|
||||||
|
return domain
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_filename(s):
|
||||||
|
"""Sanitize a string for safe filename use."""
|
||||||
|
return re.sub(r'[^a-zA-Z0-9._-]', '_', s)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_cancelled(db, job_id):
|
||||||
|
"""Check if a job has been cancelled in the DB."""
|
||||||
|
job = db.get_scrape_job(job_id)
|
||||||
|
return job and job['status'] == 'cancelled'
|
||||||
|
|
||||||
|
|
||||||
|
def _kill_process(proc, timeout=5):
|
||||||
|
"""Gracefully terminate a subprocess, force kill if needed."""
|
||||||
|
if proc.poll() is not None:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
proc.terminate()
|
||||||
|
proc.wait(timeout=timeout)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
proc.kill()
|
||||||
|
proc.wait(timeout=2)
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup_orphan_containers():
|
||||||
|
"""Remove any leftover recon-scraper-* Docker containers from a previous crash."""
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
['docker', 'ps', '-a', '--filter', 'name=recon-scraper-', '--format', '{{.Names}}'],
|
||||||
|
capture_output=True, text=True, timeout=10
|
||||||
|
)
|
||||||
|
if result.returncode == 0 and result.stdout.strip():
|
||||||
|
for name in result.stdout.strip().split('\n'):
|
||||||
|
name = name.strip()
|
||||||
|
if name:
|
||||||
|
subprocess.run(['docker', 'rm', '-f', name], capture_output=True, timeout=10)
|
||||||
|
logger.info(f"Cleaned up orphan container: {name}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Orphan container cleanup failed: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Zimit crawl backend ──────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _crawl_zimit(job, config, stop_event, db):
|
||||||
|
"""
|
||||||
|
Crawl a URL using Zimit (openZIM Docker crawler).
|
||||||
|
|
||||||
|
Returns (page_count, zim_filename, error_msg).
|
||||||
|
On success: (count, filename, None)
|
||||||
|
On failure: (0, None, error_string)
|
||||||
|
"""
|
||||||
|
job_id = job['id']
|
||||||
|
url = job['url']
|
||||||
|
title = job.get('title') or _sanitize_domain(url)
|
||||||
|
language = job.get('language') or config.get('scraper', {}).get('default_language', 'eng')
|
||||||
|
category = job.get('category') or ''
|
||||||
|
|
||||||
|
scraper_cfg = config.get('scraper', {})
|
||||||
|
output_dir = scraper_cfg.get('output_dir', '/mnt/kiwix')
|
||||||
|
docker_image = scraper_cfg.get('docker_image', 'ghcr.io/openzim/zimit')
|
||||||
|
docker_workers = scraper_cfg.get('docker_workers', 2)
|
||||||
|
|
||||||
|
domain = _sanitize_domain(url)
|
||||||
|
date_tag = datetime.now().strftime('%Y-%m')
|
||||||
|
container_name = f'recon-scraper-{job_id}'
|
||||||
|
tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}')
|
||||||
|
|
||||||
|
# Clean up any pre-existing container with same name (retry scenario)
|
||||||
|
subprocess.run(['docker', 'rm', '-f', container_name], capture_output=True, timeout=10)
|
||||||
|
|
||||||
|
os.makedirs(tmp_dir, exist_ok=True)
|
||||||
|
|
||||||
|
description = f"Mirror of {domain}"
|
||||||
|
if category:
|
||||||
|
description = f"{category} — mirror of {domain}"
|
||||||
|
|
||||||
|
docker_cmd = [
|
||||||
|
'docker', 'run',
|
||||||
|
'--name', container_name,
|
||||||
|
'-v', f'{tmp_dir}:/output',
|
||||||
|
docker_image,
|
||||||
|
'zimit',
|
||||||
|
'--seeds', url,
|
||||||
|
'--name', _sanitize_filename(domain),
|
||||||
|
'--zim-lang', language,
|
||||||
|
'--title', title,
|
||||||
|
'--description', description[:80],
|
||||||
|
'--output', '/output',
|
||||||
|
'-w', str(docker_workers),
|
||||||
|
]
|
||||||
|
|
||||||
|
logger.info(f"Job {job_id}: Zimit crawl starting — {url}")
|
||||||
|
try:
|
||||||
|
proc = subprocess.Popen(
|
||||||
|
docker_cmd,
|
||||||
|
stdout=subprocess.DEVNULL,
|
||||||
|
stderr=subprocess.DEVNULL,
|
||||||
|
)
|
||||||
|
db.update_scrape_job(job_id, subprocess_pid=proc.pid)
|
||||||
|
|
||||||
|
last_progress_check = 0
|
||||||
|
while proc.poll() is None:
|
||||||
|
if stop_event.is_set() or _check_cancelled(db, job_id):
|
||||||
|
# Stop the Docker container
|
||||||
|
subprocess.run(['docker', 'rm', '-f', container_name],
|
||||||
|
capture_output=True, timeout=10)
|
||||||
|
_kill_process(proc)
|
||||||
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
return 0, None, 'cancelled'
|
||||||
|
|
||||||
|
# Check progress every 30s via docker logs
|
||||||
|
now = time.time()
|
||||||
|
if now - last_progress_check >= 30:
|
||||||
|
last_progress_check = now
|
||||||
|
try:
|
||||||
|
log_result = subprocess.run(
|
||||||
|
['docker', 'logs', '--tail', '20', container_name],
|
||||||
|
capture_output=True, text=True, timeout=10
|
||||||
|
)
|
||||||
|
if log_result.returncode == 0:
|
||||||
|
# Browsertrix logs JSON with "crawled":N — check both stdout and stderr
|
||||||
|
log_text = log_result.stdout or log_result.stderr or ''
|
||||||
|
lines = log_text.strip().split('\n')
|
||||||
|
for line in reversed(lines):
|
||||||
|
match = re.search(r'"crawled":(\d+)', line)
|
||||||
|
if match:
|
||||||
|
count = int(match.group(1))
|
||||||
|
if count > 0:
|
||||||
|
db.update_scrape_job(job_id, page_count=count)
|
||||||
|
break
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
proc.wait(timeout=5)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
pass
|
||||||
|
|
||||||
|
db.update_scrape_job(job_id, subprocess_pid=None)
|
||||||
|
|
||||||
|
if stop_event.is_set() or _check_cancelled(db, job_id):
|
||||||
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
return 0, None, 'cancelled'
|
||||||
|
|
||||||
|
if proc.returncode != 0:
|
||||||
|
# Capture last 50 lines of docker logs for error context
|
||||||
|
error_msg = f"Zimit exited with code {proc.returncode}"
|
||||||
|
try:
|
||||||
|
log_result = subprocess.run(
|
||||||
|
['docker', 'logs', '--tail', '50', container_name],
|
||||||
|
capture_output=True, text=True, timeout=10
|
||||||
|
)
|
||||||
|
log_text = (log_result.stderr or log_result.stdout or '').strip()
|
||||||
|
if log_text:
|
||||||
|
# Take last 500 chars
|
||||||
|
error_msg += f": {log_text[-500:]}"
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
# Remove container (no --rm flag, so we clean up manually)
|
||||||
|
subprocess.run(['docker', 'rm', '-f', container_name],
|
||||||
|
capture_output=True, timeout=10)
|
||||||
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
return 0, None, error_msg
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
return 0, None, f"Zimit error: {e}"
|
||||||
|
|
||||||
|
# Remove container (no --rm flag, so we clean up manually after getting logs)
|
||||||
|
subprocess.run(['docker', 'rm', '-f', container_name],
|
||||||
|
capture_output=True, timeout=10)
|
||||||
|
|
||||||
|
# Find the output ZIM file
|
||||||
|
zim_files = _glob.glob(os.path.join(tmp_dir, '*.zim'))
|
||||||
|
if not zim_files:
|
||||||
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
return 0, None, 'Zimit produced no ZIM file'
|
||||||
|
|
||||||
|
src_zim = zim_files[0] # Should be exactly one
|
||||||
|
|
||||||
|
# Get page count from file size as rough estimate if we don't have one
|
||||||
|
page_count = 0
|
||||||
|
try:
|
||||||
|
job_state = db.get_scrape_job(job_id)
|
||||||
|
page_count = job_state.get('page_count') or 0
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Rename to final location
|
||||||
|
zim_filename = f"{_sanitize_filename(domain)}_{language}_{date_tag}_{job_id}.zim"
|
||||||
|
zim_path = os.path.join(output_dir, zim_filename)
|
||||||
|
try:
|
||||||
|
shutil.move(src_zim, zim_path)
|
||||||
|
except Exception as e:
|
||||||
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
return 0, None, f"Failed to move ZIM to output dir: {e}"
|
||||||
|
|
||||||
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
logger.info(f"Job {job_id}: Zimit complete — {zim_filename}")
|
||||||
|
|
||||||
|
return page_count, zim_filename, None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main job pipeline ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _process_job(job, config, stop_event):
|
||||||
|
"""Execute the full scrape pipeline for a single job."""
|
||||||
|
db = StatusDB()
|
||||||
|
job_id = job['id']
|
||||||
|
|
||||||
|
logger.info(f"Job {job_id}: starting scrape of {job['url']}")
|
||||||
|
|
||||||
|
# ── Phase 1: Crawl via Zimit ───────────────────────────────────
|
||||||
|
db.update_scrape_job(job_id,
|
||||||
|
status='scraping',
|
||||||
|
crawl_mode='zimit',
|
||||||
|
started_at=_now())
|
||||||
|
|
||||||
|
if stop_event.is_set() or _check_cancelled(db, job_id):
|
||||||
|
_handle_cancel(db, job_id)
|
||||||
|
return
|
||||||
|
|
||||||
|
page_count, zim_filename, error = _crawl_zimit(job, config, stop_event, db)
|
||||||
|
|
||||||
|
if error == 'cancelled':
|
||||||
|
_handle_cancel(db, job_id)
|
||||||
|
return
|
||||||
|
elif error:
|
||||||
|
db.update_scrape_job(job_id,
|
||||||
|
status='failed',
|
||||||
|
error_message=error[:1000],
|
||||||
|
subprocess_pid=None,
|
||||||
|
completed_at=_now())
|
||||||
|
return
|
||||||
|
|
||||||
|
db.update_scrape_job(job_id, page_count=page_count)
|
||||||
|
|
||||||
|
# ── Phase 2: Register with kiwix-serve ─────────────────────────
|
||||||
|
if stop_event.is_set() or _check_cancelled(db, job_id):
|
||||||
|
_handle_cancel(db, job_id)
|
||||||
|
return
|
||||||
|
|
||||||
|
db.update_scrape_job(job_id, status='registering')
|
||||||
|
|
||||||
|
output_dir = config.get('scraper', {}).get('output_dir', '/mnt/kiwix')
|
||||||
|
zim_path = os.path.join(output_dir, zim_filename)
|
||||||
|
kiwix_manage = shutil.which('kiwix-manage') or '/opt/recon/bin/kiwix-manage'
|
||||||
|
library_xml = '/mnt/kiwix/library.xml'
|
||||||
|
|
||||||
|
try:
|
||||||
|
subprocess.run(
|
||||||
|
[kiwix_manage, library_xml, 'add', zim_path],
|
||||||
|
capture_output=True, text=True, timeout=30
|
||||||
|
)
|
||||||
|
logger.info(f"Job {job_id}: registered with kiwix-serve library")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Job {job_id}: kiwix-manage add failed: {e}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(['pidof', 'kiwix-serve'], capture_output=True, text=True, timeout=5)
|
||||||
|
if result.returncode == 0 and result.stdout.strip():
|
||||||
|
pid = int(result.stdout.strip().split()[0])
|
||||||
|
os.kill(pid, signal.SIGHUP)
|
||||||
|
logger.info(f"Job {job_id}: sent SIGHUP to kiwix-serve (pid {pid})")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Job {job_id}: failed to signal kiwix-serve: {e}")
|
||||||
|
|
||||||
|
# Wait for kiwix-serve to reload its catalog after SIGHUP
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
|
zim_source_id = None
|
||||||
|
try:
|
||||||
|
from .zim_monitor import scan_zims
|
||||||
|
scan_zims()
|
||||||
|
conn = db._get_conn()
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT id FROM zim_sources WHERE zim_filename = ?", (zim_filename,)
|
||||||
|
).fetchone()
|
||||||
|
if row:
|
||||||
|
zim_source_id = row['id']
|
||||||
|
logger.info(f"Job {job_id}: linked to zim_source_id={zim_source_id}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Job {job_id}: scan_zims failed: {e}")
|
||||||
|
|
||||||
|
# ── Phase 3: Complete ──────────────────────────────────────────
|
||||||
|
db.update_scrape_job(job_id,
|
||||||
|
status='complete',
|
||||||
|
zim_filename=zim_filename,
|
||||||
|
zim_source_id=zim_source_id,
|
||||||
|
completed_at=_now())
|
||||||
|
|
||||||
|
logger.info(f"Job {job_id}: complete — {zim_filename} ({page_count} pages)")
|
||||||
|
|
||||||
|
|
||||||
|
def _handle_cancel(db, job_id):
|
||||||
|
"""Handle job cancellation: clean up Docker container and update status."""
|
||||||
|
container_name = f'recon-scraper-{job_id}'
|
||||||
|
try:
|
||||||
|
subprocess.run(['docker', 'rm', '-f', container_name],
|
||||||
|
capture_output=True, timeout=10)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Clean up tmp dir if it exists
|
||||||
|
output_dir = '/mnt/kiwix'
|
||||||
|
tmp_dir = os.path.join(output_dir, f'.zimit-tmp-{job_id}')
|
||||||
|
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
logger.info(f"Job {job_id}: cancelled")
|
||||||
|
db.update_scrape_job(job_id,
|
||||||
|
status='cancelled',
|
||||||
|
subprocess_pid=None,
|
||||||
|
completed_at=_now())
|
||||||
|
|
@ -105,6 +105,25 @@ class StatusDB:
|
||||||
except Exception:
|
except Exception:
|
||||||
pass # column already exists
|
pass # column already exists
|
||||||
|
|
||||||
|
# Migration: add subprocess_pid column to scrape_jobs if missing
|
||||||
|
try:
|
||||||
|
conn.execute("ALTER TABLE scrape_jobs ADD COLUMN subprocess_pid INTEGER")
|
||||||
|
except Exception:
|
||||||
|
pass # column already exists
|
||||||
|
|
||||||
|
# Migration: add reject pattern columns to scrape_jobs if missing
|
||||||
|
for col, coltype in [('additional_reject_patterns', 'TEXT'), ('skip_default_patterns', 'INTEGER DEFAULT 0')]:
|
||||||
|
try:
|
||||||
|
conn.execute(f"ALTER TABLE scrape_jobs ADD COLUMN {col} {coltype}")
|
||||||
|
except Exception:
|
||||||
|
pass # column already exists
|
||||||
|
|
||||||
|
# Migration: add crawl_mode column to scrape_jobs if missing
|
||||||
|
try:
|
||||||
|
conn.execute("ALTER TABLE scrape_jobs ADD COLUMN crawl_mode TEXT")
|
||||||
|
except Exception:
|
||||||
|
pass # column already exists
|
||||||
|
|
||||||
# Stream B: file_operations + duplicate_review tables
|
# Stream B: file_operations + duplicate_review tables
|
||||||
conn.executescript("""
|
conn.executescript("""
|
||||||
CREATE TABLE IF NOT EXISTS file_operations (
|
CREATE TABLE IF NOT EXISTS file_operations (
|
||||||
|
|
@ -142,6 +161,28 @@ class StatusDB:
|
||||||
resolved_at TEXT
|
resolved_at TEXT
|
||||||
);
|
);
|
||||||
CREATE INDEX IF NOT EXISTS idx_dupreview_status ON duplicate_review(status);
|
CREATE INDEX IF NOT EXISTS idx_dupreview_status ON duplicate_review(status);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS scrape_jobs (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
url TEXT NOT NULL,
|
||||||
|
title TEXT,
|
||||||
|
language TEXT DEFAULT 'eng',
|
||||||
|
category TEXT,
|
||||||
|
status TEXT DEFAULT 'pending',
|
||||||
|
page_count INTEGER DEFAULT 0,
|
||||||
|
error_message TEXT,
|
||||||
|
zim_filename TEXT,
|
||||||
|
zim_source_id INTEGER,
|
||||||
|
workspace_path TEXT,
|
||||||
|
subprocess_pid INTEGER,
|
||||||
|
additional_reject_patterns TEXT,
|
||||||
|
skip_default_patterns INTEGER DEFAULT 0,
|
||||||
|
crawl_mode TEXT,
|
||||||
|
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
started_at TEXT,
|
||||||
|
completed_at TEXT
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_scrape_status ON scrape_jobs(status);
|
||||||
""")
|
""")
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
|
|
@ -406,6 +447,50 @@ class StatusDB:
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Scraper Job Helpers ─────────────────────────────────────
|
||||||
|
|
||||||
|
def get_pending_scrape_job(self):
|
||||||
|
"""Fetch the oldest pending scrape job."""
|
||||||
|
conn = self._get_conn()
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT * FROM scrape_jobs WHERE status = 'pending' ORDER BY id ASC LIMIT 1"
|
||||||
|
).fetchone()
|
||||||
|
return dict(row) if row else None
|
||||||
|
|
||||||
|
def update_scrape_job(self, job_id, **kwargs):
|
||||||
|
"""Update arbitrary columns on a scrape job."""
|
||||||
|
if not kwargs:
|
||||||
|
return
|
||||||
|
conn = self._get_conn()
|
||||||
|
sets = []
|
||||||
|
vals = []
|
||||||
|
for k, v in kwargs.items():
|
||||||
|
sets.append(f"{k} = ?")
|
||||||
|
vals.append(v)
|
||||||
|
vals.append(job_id)
|
||||||
|
conn.execute(f"UPDATE scrape_jobs SET {', '.join(sets)} WHERE id = ?", vals)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
def get_scrape_jobs(self, status=None):
|
||||||
|
"""List scrape jobs, optionally filtered by status."""
|
||||||
|
conn = self._get_conn()
|
||||||
|
if status:
|
||||||
|
rows = conn.execute(
|
||||||
|
"SELECT * FROM scrape_jobs WHERE status = ? ORDER BY id DESC", (status,)
|
||||||
|
).fetchall()
|
||||||
|
else:
|
||||||
|
rows = conn.execute(
|
||||||
|
"SELECT * FROM scrape_jobs ORDER BY id DESC"
|
||||||
|
).fetchall()
|
||||||
|
return [dict(r) for r in rows]
|
||||||
|
|
||||||
|
def get_scrape_job(self, job_id):
|
||||||
|
"""Get a single scrape job by ID."""
|
||||||
|
conn = self._get_conn()
|
||||||
|
row = conn.execute("SELECT * FROM scrape_jobs WHERE id = ?", (job_id,)).fetchone()
|
||||||
|
return dict(row) if row else None
|
||||||
|
|
||||||
# ── Stream B: File Operations ───────────────────────────────────
|
# ── Stream B: File Operations ───────────────────────────────────
|
||||||
|
|
||||||
def log_file_operation(self, doc_hash, operation, source_path, target_path,
|
def log_file_operation(self, doc_hash, operation, source_path, target_path,
|
||||||
|
|
|
||||||
217
lib/zim_monitor.py
Normal file
217
lib/zim_monitor.py
Normal file
|
|
@ -0,0 +1,217 @@
|
||||||
|
"""
|
||||||
|
ZIM Monitor — detects ZIMs loaded in kiwix-serve and tracks them in recon.db.
|
||||||
|
|
||||||
|
Polls the kiwix-serve OPDS v2 catalog, compares against the zim_sources table,
|
||||||
|
and for new ZIMs reads accurate metadata via python-libzim's Counter field.
|
||||||
|
|
||||||
|
Standalone: python3 /opt/recon/lib/zim_monitor.py
|
||||||
|
As module: from lib.zim_monitor import scan_zims
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
import urllib.request
|
||||||
|
from xml.etree import ElementTree as ET
|
||||||
|
|
||||||
|
sys.path.insert(0, "/opt/recon")
|
||||||
|
from lib.utils import setup_logging
|
||||||
|
|
||||||
|
try:
|
||||||
|
from libzim.reader import Archive
|
||||||
|
HAVE_LIBZIM = True
|
||||||
|
except ImportError:
|
||||||
|
HAVE_LIBZIM = False
|
||||||
|
|
||||||
|
OPDS_URL = "http://localhost:8430/catalog/v2/entries?count=-1"
|
||||||
|
ZIM_DIR = "/mnt/kiwix"
|
||||||
|
DB_PATH = "/opt/recon/data/recon.db"
|
||||||
|
|
||||||
|
ATOM_NS = "http://www.w3.org/2005/Atom"
|
||||||
|
|
||||||
|
logger = logging.getLogger("recon.zim_monitor")
|
||||||
|
|
||||||
|
|
||||||
|
def _text(element, tag, ns=ATOM_NS):
|
||||||
|
"""Get text content of a child element, or None."""
|
||||||
|
child = element.find(f"{{{ns}}}{tag}")
|
||||||
|
if child is not None and child.text:
|
||||||
|
return child.text.strip()
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_counter(counter_str):
|
||||||
|
"""Parse ZIM Counter metadata into {mimetype: count}."""
|
||||||
|
result = {}
|
||||||
|
for pair in counter_str.split(";"):
|
||||||
|
if "=" in pair:
|
||||||
|
mime, count = pair.split("=", 1)
|
||||||
|
try:
|
||||||
|
result[mime.strip()] = int(count.strip())
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_opds():
|
||||||
|
"""Fetch OPDS v2 catalog from kiwix-serve. Returns list of dicts."""
|
||||||
|
try:
|
||||||
|
with urllib.request.urlopen(OPDS_URL, timeout=10) as resp:
|
||||||
|
data = resp.read()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Failed to fetch OPDS catalog: %s", e)
|
||||||
|
return []
|
||||||
|
|
||||||
|
root = ET.fromstring(data)
|
||||||
|
entries = []
|
||||||
|
for entry in root.findall(f"{{{ATOM_NS}}}entry"):
|
||||||
|
uuid_raw = _text(entry, "id")
|
||||||
|
uuid = uuid_raw.replace("urn:uuid:", "") if uuid_raw else None
|
||||||
|
|
||||||
|
# Derive ZIM filename from the content link href
|
||||||
|
zim_filename = None
|
||||||
|
for link in entry.findall(f"{{{ATOM_NS}}}link"):
|
||||||
|
if link.get("type") == "text/html":
|
||||||
|
href = link.get("href", "")
|
||||||
|
# href looks like /content/appropedia_en_all_maxi_2025-11
|
||||||
|
name = href.rsplit("/", 1)[-1] if "/" in href else href
|
||||||
|
if name:
|
||||||
|
zim_filename = name + ".zim"
|
||||||
|
break
|
||||||
|
|
||||||
|
entries.append({
|
||||||
|
"uuid": uuid,
|
||||||
|
"title": _text(entry, "title"),
|
||||||
|
"name": _text(entry, "name"),
|
||||||
|
"flavour": _text(entry, "flavour"),
|
||||||
|
"language": _text(entry, "language"),
|
||||||
|
"category": _text(entry, "category") or None,
|
||||||
|
"summary": _text(entry, "summary"),
|
||||||
|
"article_count_opds": int(_text(entry, "articleCount") or 0),
|
||||||
|
"zim_filename": zim_filename,
|
||||||
|
})
|
||||||
|
return entries
|
||||||
|
|
||||||
|
|
||||||
|
def get_libzim_metadata(zim_path):
|
||||||
|
"""Open a ZIM file and read accurate metadata via python-libzim."""
|
||||||
|
if not HAVE_LIBZIM:
|
||||||
|
logger.warning("python-libzim not available, skipping metadata read")
|
||||||
|
return {}
|
||||||
|
|
||||||
|
zim = Archive(zim_path)
|
||||||
|
meta = {}
|
||||||
|
|
||||||
|
def _get_meta(key):
|
||||||
|
try:
|
||||||
|
return zim.get_metadata(key).decode("utf-8", errors="replace")
|
||||||
|
except RuntimeError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
meta["title"] = _get_meta("Title")
|
||||||
|
meta["description"] = _get_meta("Description")
|
||||||
|
meta["language"] = _get_meta("Language")
|
||||||
|
meta["tags"] = _get_meta("Tags")
|
||||||
|
|
||||||
|
counter_str = _get_meta("Counter")
|
||||||
|
if counter_str:
|
||||||
|
counts = parse_counter(counter_str)
|
||||||
|
meta["article_count"] = counts.get("text/html", 0)
|
||||||
|
meta["counter_raw"] = counter_str
|
||||||
|
else:
|
||||||
|
meta["article_count"] = 0
|
||||||
|
meta["counter_raw"] = None
|
||||||
|
|
||||||
|
return meta
|
||||||
|
|
||||||
|
|
||||||
|
def scan_zims():
|
||||||
|
"""Compare OPDS catalog against zim_sources table. Insert/update as needed."""
|
||||||
|
logger.info("Scanning kiwix-serve OPDS catalog...")
|
||||||
|
opds_entries = fetch_opds()
|
||||||
|
if not opds_entries:
|
||||||
|
logger.info("No entries in OPDS catalog (or fetch failed)")
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info("OPDS returned %d entries", len(opds_entries))
|
||||||
|
|
||||||
|
con = sqlite3.connect(DB_PATH)
|
||||||
|
con.row_factory = sqlite3.Row
|
||||||
|
|
||||||
|
# Get existing zim_sources keyed by filename
|
||||||
|
existing = {}
|
||||||
|
for row in con.execute("SELECT id, zim_filename, status FROM zim_sources"):
|
||||||
|
existing[row["zim_filename"]] = dict(row)
|
||||||
|
|
||||||
|
opds_filenames = set()
|
||||||
|
new_count = 0
|
||||||
|
|
||||||
|
for entry in opds_entries:
|
||||||
|
filename = entry["zim_filename"]
|
||||||
|
if not filename:
|
||||||
|
logger.warning("Skipping OPDS entry with no derivable filename: %s", entry)
|
||||||
|
continue
|
||||||
|
|
||||||
|
opds_filenames.add(filename)
|
||||||
|
|
||||||
|
if filename in existing:
|
||||||
|
logger.debug("Already tracked: %s (status=%s)", filename, existing[filename]["status"])
|
||||||
|
continue
|
||||||
|
|
||||||
|
# New ZIM — read accurate metadata via python-libzim
|
||||||
|
zim_path = os.path.join(ZIM_DIR, filename)
|
||||||
|
if not os.path.isfile(zim_path):
|
||||||
|
logger.warning("ZIM file not found on disk: %s", zim_path)
|
||||||
|
continue
|
||||||
|
|
||||||
|
logger.info("New ZIM detected: %s — reading metadata via libzim", filename)
|
||||||
|
meta = get_libzim_metadata(zim_path)
|
||||||
|
|
||||||
|
con.execute(
|
||||||
|
"""INSERT INTO zim_sources
|
||||||
|
(zim_filename, zim_path, zim_uuid, title, description,
|
||||||
|
language, category, article_count, status)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'detected')""",
|
||||||
|
(
|
||||||
|
filename,
|
||||||
|
zim_path,
|
||||||
|
entry["uuid"],
|
||||||
|
meta.get("title") or entry["title"],
|
||||||
|
meta.get("description") or entry["summary"],
|
||||||
|
meta.get("language") or entry["language"],
|
||||||
|
entry["category"],
|
||||||
|
meta.get("article_count", 0),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
new_count += 1
|
||||||
|
logger.info(
|
||||||
|
" Inserted: %s — title=%r, articles=%s (OPDS said %s)",
|
||||||
|
filename,
|
||||||
|
meta.get("title") or entry["title"],
|
||||||
|
meta.get("article_count", 0),
|
||||||
|
entry["article_count_opds"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Detect removed ZIMs (in DB but not in OPDS, and not already marked removed)
|
||||||
|
removed_count = 0
|
||||||
|
for filename, row in existing.items():
|
||||||
|
if filename not in opds_filenames and row["status"] != "removed":
|
||||||
|
con.execute(
|
||||||
|
"UPDATE zim_sources SET status = 'removed' WHERE id = ?",
|
||||||
|
(row["id"],),
|
||||||
|
)
|
||||||
|
removed_count += 1
|
||||||
|
logger.info("Marked removed: %s", filename)
|
||||||
|
|
||||||
|
con.commit()
|
||||||
|
con.close()
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Scan complete: %d new, %d removed, %d total in catalog",
|
||||||
|
new_count, removed_count, len(opds_entries),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
setup_logging("recon.zim_monitor")
|
||||||
|
scan_zims()
|
||||||
11
recon.py
11
recon.py
|
|
@ -692,12 +692,23 @@ def cmd_service(args):
|
||||||
daemon=True, name='dashboard'),
|
daemon=True, name='dashboard'),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Scraper daemon: polls for pending scrape jobs, runs wget+zimwriterfs pipeline
|
||||||
|
scraper_cfg = config.get('scraper', {})
|
||||||
|
if scraper_cfg.get('workspace'):
|
||||||
|
from lib.scraper_runner import scraper_loop
|
||||||
|
threads.append(
|
||||||
|
threading.Thread(target=lambda: scraper_loop(stop_event, config),
|
||||||
|
daemon=True, name='scraper')
|
||||||
|
)
|
||||||
|
|
||||||
logger.info("=== RECON Service Starting ===")
|
logger.info("=== RECON Service Starting ===")
|
||||||
logger.info(f" Dashboard: {web_host}:{web_port}")
|
logger.info(f" Dashboard: {web_host}:{web_port}")
|
||||||
logger.info(f" Workers: enrich={enrich_workers}, embed={embed_workers}")
|
logger.info(f" Workers: enrich={enrich_workers}, embed={embed_workers}")
|
||||||
logger.info(f" Dispatcher: every {dispatch_interval}s | Filing: every {filing_interval}s")
|
logger.info(f" Dispatcher: every {dispatch_interval}s | Filing: every {filing_interval}s")
|
||||||
pt_interval = config.get("peertube", {}).get("poll_interval", 1800)
|
pt_interval = config.get("peertube", {}).get("poll_interval", 1800)
|
||||||
logger.info(f" PeerTube acquisition: every {pt_interval}s")
|
logger.info(f" PeerTube acquisition: every {pt_interval}s")
|
||||||
|
if scraper_cfg.get('workspace'):
|
||||||
|
logger.info(f" Scraper: every {scraper_cfg.get('poll_interval', 300)}s")
|
||||||
logger.info(f" Progress: every {progress_interval}s")
|
logger.info(f" Progress: every {progress_interval}s")
|
||||||
|
|
||||||
for t in threads:
|
for t in threads:
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ anyio==4.12.1
|
||||||
babel==2.18.0
|
babel==2.18.0
|
||||||
beautifulsoup4==4.14.3
|
beautifulsoup4==4.14.3
|
||||||
blinker==1.9.0
|
blinker==1.9.0
|
||||||
|
cachetools==7.1.3
|
||||||
certifi==2026.1.4
|
certifi==2026.1.4
|
||||||
cffi==2.0.0
|
cffi==2.0.0
|
||||||
charset-normalizer==3.4.4
|
charset-normalizer==3.4.4
|
||||||
|
|
|
||||||
|
|
@ -211,6 +211,7 @@ tr:hover { background: var(--bg-secondary); }
|
||||||
.badge-web { background: #1e3a5f; color: #60a5fa; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
|
.badge-web { background: #1e3a5f; color: #60a5fa; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
|
||||||
.badge-pdf { background: #2d5a2d; color: #4ade80; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
|
.badge-pdf { background: #2d5a2d; color: #4ade80; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
|
||||||
.badge-transcript { background: #3b1f5e; color: #c084fc; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
|
.badge-transcript { background: #3b1f5e; color: #c084fc; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
|
||||||
|
.badge-wiki { background: #1f4a3b; color: #34d399; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
|
||||||
|
|
||||||
/* ── Trend indicators ── */
|
/* ── Trend indicators ── */
|
||||||
.trend { font-size: 11px; margin-left: 6px; }
|
.trend { font-size: 11px; margin-left: 6px; }
|
||||||
|
|
@ -315,3 +316,19 @@ tr:hover { background: var(--bg-secondary); }
|
||||||
.errors-panel.has-errors { display: block; }
|
.errors-panel.has-errors { display: block; }
|
||||||
.errors-panel summary { color: var(--red); cursor: pointer; font-size: 13px; margin-bottom: 8px; }
|
.errors-panel summary { color: var(--red); cursor: pointer; font-size: 13px; margin-bottom: 8px; }
|
||||||
.errors-panel .error-line { color: var(--text-muted); font-size: 11px; padding: 2px 0; border-bottom: 1px solid var(--border); }
|
.errors-panel .error-line { color: var(--text-muted); font-size: 11px; padding: 2px 0; border-bottom: 1px solid var(--border); }
|
||||||
|
|
||||||
|
/* ── Toggle switch ── */
|
||||||
|
.toggle-switch { position: relative; display: inline-block; width: 40px; height: 20px; }
|
||||||
|
.toggle-switch input { opacity: 0; width: 0; height: 0; }
|
||||||
|
.toggle-slider { position: absolute; cursor: pointer; inset: 0; background: #333; border-radius: 20px; transition: 0.3s; }
|
||||||
|
.toggle-slider:before { content: ''; position: absolute; height: 16px; width: 16px; left: 2px; bottom: 2px; background: #888; border-radius: 50%; transition: 0.3s; }
|
||||||
|
.toggle-switch input:checked + .toggle-slider { background: #1a4a2e; }
|
||||||
|
.toggle-switch input:checked + .toggle-slider:before { transform: translateX(20px); background: #00ff41; }
|
||||||
|
|
||||||
|
/* ── Kiwix status badges ── */
|
||||||
|
.badge-complete { background: #1a4a2e; color: #00ff41; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
|
||||||
|
.badge-ingesting { background: #1a3a5a; color: #0ea5e9; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
|
||||||
|
.badge-detected { background: #333; color: #888; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
|
||||||
|
.badge-processing { background: #4a3a1a; color: #f59e0b; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
|
||||||
|
.badge-extracting { background: #1a3a5a; color: #0ea5e9; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
|
||||||
|
.badge-failed { background: #4a1a1a; color: #ff4444; padding: 2px 8px; border-radius: var(--radius); font-size: 11px; }
|
||||||
|
|
|
||||||
|
|
@ -88,7 +88,7 @@
|
||||||
var pipeCount = s.in_pipeline || 0;
|
var pipeCount = s.in_pipeline || 0;
|
||||||
totalCat += catCount; totalComp += compCount; totalPipe += pipeCount;
|
totalCat += catCount; totalComp += compCount; totalPipe += pipeCount;
|
||||||
totalConcepts += s.concepts; totalVectors += s.vectors;
|
totalConcepts += s.concepts; totalVectors += s.vectors;
|
||||||
var badge = s.type === 'transcript' ? '<span class="badge-transcript">TRANSCRIPT</span>' : s.type === 'web' ? '<span class="badge-web">WEB</span>' : '<span class="badge-pdf">PDF</span>';
|
var badge = s.type === 'transcript' ? '<span class="badge-transcript">TRANSCRIPT</span>' : s.type === 'web' ? '<span class="badge-web">WEB</span>' : s.type === 'wiki' ? '<span class="badge-wiki">WIKI</span>' : '<span class="badge-pdf">PDF</span>';
|
||||||
var compPct = catCount > 0 ? (compCount / catCount * 100) : 0;
|
var compPct = catCount > 0 ? (compCount / catCount * 100) : 0;
|
||||||
var pipePct = catCount > 0 ? (pipeCount / catCount * 100) : 0;
|
var pipePct = catCount > 0 ? (pipeCount / catCount * 100) : 0;
|
||||||
var compColor = compPct >= 100 ? '#00ff41' : compPct > 0 ? '#ffa500' : '#666';
|
var compColor = compPct >= 100 ? '#00ff41' : compPct > 0 ? '#ffa500' : '#666';
|
||||||
|
|
@ -185,7 +185,7 @@
|
||||||
rtb.innerHTML = '<tr><td colspan="4" class="text-dim">None yet</td></tr>';
|
rtb.innerHTML = '<tr><td colspan="4" class="text-dim">None yet</td></tr>';
|
||||||
} else {
|
} else {
|
||||||
rtb.innerHTML = data.recent_complete.map(function(r) {
|
rtb.innerHTML = data.recent_complete.map(function(r) {
|
||||||
var badge = r.type === 'transcript' ? '<span class="badge-transcript">TRANSCRIPT</span>' : r.type === 'web' ? '<span class="badge-web">WEB</span>' : '<span class="badge-pdf">PDF</span>';
|
var badge = r.type === 'transcript' ? '<span class="badge-transcript">TRANSCRIPT</span>' : r.type === 'web' ? '<span class="badge-web">WEB</span>' : r.type === 'wiki' ? '<span class="badge-wiki">WIKI</span>' : '<span class="badge-pdf">PDF</span>';
|
||||||
return '<tr><td>' + r.title + '</td><td>' + badge + '</td><td>' +
|
return '<tr><td>' + r.title + '</td><td>' + badge + '</td><td>' +
|
||||||
r.concepts + '</td><td>' + r.vectors + '</td></tr>';
|
r.concepts + '</td><td>' + r.vectors + '</td></tr>';
|
||||||
}).join('');
|
}).join('');
|
||||||
|
|
|
||||||
147
static/js/kiwix.js
Normal file
147
static/js/kiwix.js
Normal file
|
|
@ -0,0 +1,147 @@
|
||||||
|
/* RECON Kiwix Dashboard JS */
|
||||||
|
(function() {
|
||||||
|
'use strict';
|
||||||
|
|
||||||
|
function loadKiwixDashboard() {
|
||||||
|
return RECON.fetchJSON('/api/kiwix/sources').then(function(data) {
|
||||||
|
// Update stat cards
|
||||||
|
var t = data.totals || {};
|
||||||
|
RECON.set('kx-sources', RECON.fmt(t.sources));
|
||||||
|
RECON.set('kx-articles', RECON.fmt(t.articles));
|
||||||
|
RECON.set('kx-processed', RECON.fmt(t.processed));
|
||||||
|
RECON.set('kx-pipeline', RECON.fmt(t.in_pipeline));
|
||||||
|
|
||||||
|
// Kiwix-serve status dot
|
||||||
|
var ks = data.kiwix_serve || {};
|
||||||
|
var dot = document.getElementById('svc-kiwix-serve');
|
||||||
|
dot.className = 'svc-dot ' + (ks.status === 'active' ? 'active' : 'inactive');
|
||||||
|
|
||||||
|
// ZIM table
|
||||||
|
var sources = data.sources || [];
|
||||||
|
var html = '';
|
||||||
|
sources.forEach(function(s) {
|
||||||
|
var es = s.effective_status || s.status;
|
||||||
|
var pipe = s.pipeline || {};
|
||||||
|
var pipeComplete = pipe.complete || 0;
|
||||||
|
var pipeTotal = 0;
|
||||||
|
for (var k in pipe) pipeTotal += pipe[k];
|
||||||
|
var pctDone = pipeTotal > 0 ? (pipeComplete / pipeTotal * 100).toFixed(1) : 0;
|
||||||
|
var statusBadge = es === 'complete' ? '<span class="badge-complete">COMPLETE</span>' :
|
||||||
|
es === 'processing' ? '<span class="badge-processing">PROCESSING</span>' :
|
||||||
|
es === 'extracting' ? '<span class="badge-extracting">EXTRACTING</span>' :
|
||||||
|
'<span class="badge-detected">DETECTED</span>';
|
||||||
|
// Derive browse URL from zim_filename
|
||||||
|
var zimName = s.zim_filename.replace(/_(?:maxi|mini|nopic)_[\d-]+\.zim$/, '');
|
||||||
|
var browseUrl = 'https://wiki.echo6.co/' + zimName + '/';
|
||||||
|
// Toggle switch
|
||||||
|
var checked = s.ingest_enabled ? ' checked' : '';
|
||||||
|
var toggle = '<label class="toggle-switch"><input type="checkbox"' + checked +
|
||||||
|
' onchange="KIWIX.toggleIngest(' + s.id + ', this.checked)">' +
|
||||||
|
'<span class="toggle-slider"></span></label>';
|
||||||
|
|
||||||
|
html += '<tr>' +
|
||||||
|
'<td><strong>' + (s.title || s.zim_filename) + '</strong>' +
|
||||||
|
'<div class="text-small text-muted">' + s.zim_filename + '</div></td>' +
|
||||||
|
'<td>' + (s.language || '\u2014') + '</td>' +
|
||||||
|
'<td>' + RECON.fmt(s.article_count) + '</td>' +
|
||||||
|
'<td>' + (es === 'complete' && pipeComplete > 0 ?
|
||||||
|
RECON.fmt(pipeComplete) + ' in Qdrant' :
|
||||||
|
es === 'processing' ?
|
||||||
|
RECON.fmt(pipeComplete) + ' / ' + RECON.fmt(pipeTotal) + ' in Qdrant (' + pctDone + '%)' :
|
||||||
|
es === 'extracting' ?
|
||||||
|
RECON.fmt(s.processed_count) + ' / ' + RECON.fmt(s.article_count) + ' extracted' :
|
||||||
|
'\u2014') + '</td>' +
|
||||||
|
'<td>' + statusBadge + '</td>' +
|
||||||
|
'<td>' + toggle + '</td>' +
|
||||||
|
'<td><a href="' + browseUrl + '" target="_blank">Browse</a></td>' +
|
||||||
|
'<td><button class="btn btn-danger" onclick="KIWIX.remove(' + s.id + ', \'' + (s.title || s.zim_filename).replace(/'/g, "\\'") + '\')">Remove</button></td>' +
|
||||||
|
'</tr>';
|
||||||
|
});
|
||||||
|
if (!html) html = '<tr><td colspan="8" class="text-muted">No ZIM sources detected</td></tr>';
|
||||||
|
RECON.setHTML('kx-table-body', html);
|
||||||
|
}).catch(function(err) {
|
||||||
|
console.error('Kiwix dashboard error:', err);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function toggleIngest(id, enabled) {
|
||||||
|
RECON.postJSON('/api/kiwix/toggle-ingest/' + id, {enabled: enabled}).then(function(data) {
|
||||||
|
if (data.ok) loadKiwixDashboard();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function removeSource(id, title) {
|
||||||
|
if (!confirm('Remove "' + title + '"?\n\nThis will delete the ZIM file, all ingested documents, and associated vectors from Qdrant. This cannot be undone.')) return;
|
||||||
|
RECON.postJSON('/api/kiwix/remove/' + id).then(function(data) {
|
||||||
|
if (data.ok) {
|
||||||
|
var r = data.results || {};
|
||||||
|
alert('Removed: ' + r.docs_deleted + ' docs, ~' + r.vectors_deleted + ' vector batches deleted, file ' + (r.file_deleted ? 'deleted' : 'not found'));
|
||||||
|
loadKiwixDashboard();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function triggerIngest(id) {
|
||||||
|
RECON.postJSON('/api/kiwix/trigger-ingest/' + id).then(function(data) {
|
||||||
|
if (data.ok) loadKiwixDashboard();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function uploadZim() {
|
||||||
|
var input = document.getElementById('kx-file-input');
|
||||||
|
var file = input.files[0];
|
||||||
|
if (!file) return;
|
||||||
|
|
||||||
|
var statusEl = document.getElementById('kx-upload-status');
|
||||||
|
var progressDiv = document.getElementById('kx-upload-progress');
|
||||||
|
var progressBar = document.getElementById('kx-progress-bar');
|
||||||
|
var progressText = document.getElementById('kx-progress-text');
|
||||||
|
|
||||||
|
statusEl.textContent = 'Uploading ' + file.name + '...';
|
||||||
|
progressDiv.style.display = 'block';
|
||||||
|
|
||||||
|
var formData = new FormData();
|
||||||
|
formData.append('file', file);
|
||||||
|
|
||||||
|
var xhr = new XMLHttpRequest();
|
||||||
|
xhr.open('POST', '/api/kiwix/upload', true);
|
||||||
|
|
||||||
|
xhr.upload.onprogress = function(e) {
|
||||||
|
if (e.lengthComputable) {
|
||||||
|
var pct = (e.loaded / e.total * 100).toFixed(1);
|
||||||
|
progressBar.style.width = pct + '%';
|
||||||
|
progressText.textContent = RECON.fmtBytes(e.loaded) + ' / ' + RECON.fmtBytes(e.total) + ' (' + pct + '%)';
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
xhr.onload = function() {
|
||||||
|
if (xhr.status === 200) {
|
||||||
|
var resp = JSON.parse(xhr.responseText);
|
||||||
|
statusEl.textContent = resp.ok ? 'Upload complete: ' + resp.filename : 'Error: ' + (resp.error || 'Unknown');
|
||||||
|
progressBar.style.width = '100%';
|
||||||
|
progressBar.style.background = resp.ok ? '#16a34a' : '#dc2626';
|
||||||
|
if (resp.ok) loadKiwixDashboard();
|
||||||
|
} else {
|
||||||
|
statusEl.textContent = 'Upload failed (HTTP ' + xhr.status + ')';
|
||||||
|
progressBar.style.background = '#dc2626';
|
||||||
|
}
|
||||||
|
input.value = '';
|
||||||
|
};
|
||||||
|
|
||||||
|
xhr.onerror = function() {
|
||||||
|
statusEl.textContent = 'Upload failed (network error)';
|
||||||
|
progressBar.style.background = '#dc2626';
|
||||||
|
input.value = '';
|
||||||
|
};
|
||||||
|
|
||||||
|
xhr.send(formData);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Expose for inline onclick
|
||||||
|
window.KIWIX = { toggleIngest: toggleIngest, triggerIngest: triggerIngest, remove: removeSource };
|
||||||
|
|
||||||
|
document.addEventListener('DOMContentLoaded', function() {
|
||||||
|
RECON.startRefresh(loadKiwixDashboard, 30000);
|
||||||
|
document.getElementById('kx-file-input').addEventListener('change', uploadZim);
|
||||||
|
});
|
||||||
|
})();
|
||||||
173
static/js/scraper.js
Normal file
173
static/js/scraper.js
Normal file
|
|
@ -0,0 +1,173 @@
|
||||||
|
/* RECON Scraper Dashboard JS */
|
||||||
|
(function() {
|
||||||
|
'use strict';
|
||||||
|
|
||||||
|
function loadJobs() {
|
||||||
|
return RECON.fetchJSON('/api/scraper/jobs').then(function(data) {
|
||||||
|
var jobs = data.jobs || [];
|
||||||
|
|
||||||
|
// Stats
|
||||||
|
var total = jobs.length;
|
||||||
|
var active = 0, complete = 0, failed = 0;
|
||||||
|
jobs.forEach(function(j) {
|
||||||
|
if (j.status === 'complete') complete++;
|
||||||
|
else if (j.status === 'failed' || j.status === 'cancelled') failed++;
|
||||||
|
else if (j.status === 'scraping' || j.status === 'registering' || j.status === 'pending') active++;
|
||||||
|
});
|
||||||
|
RECON.set('sc-total', RECON.fmt(total));
|
||||||
|
RECON.set('sc-active', RECON.fmt(active));
|
||||||
|
RECON.set('sc-complete', RECON.fmt(complete));
|
||||||
|
RECON.set('sc-failed', RECON.fmt(failed));
|
||||||
|
|
||||||
|
// Show/hide Clear Failed button
|
||||||
|
var clearBtn = document.getElementById('sc-clear-btn');
|
||||||
|
if (clearBtn) clearBtn.style.display = failed > 0 ? '' : 'none';
|
||||||
|
|
||||||
|
// Table
|
||||||
|
var html = '';
|
||||||
|
jobs.forEach(function(j) {
|
||||||
|
var badge = statusBadge(j.status);
|
||||||
|
var pages = j.page_count ? RECON.fmt(j.page_count) : '\u2014';
|
||||||
|
var zim = j.zim_filename ?
|
||||||
|
'<span class="text-small">' + j.zim_filename + '</span>' : '\u2014';
|
||||||
|
var actions = '';
|
||||||
|
|
||||||
|
if (j.status === 'scraping' || j.status === 'registering' || j.status === 'pending') {
|
||||||
|
actions = '<button class="btn btn-danger" onclick="SCRAPER.cancel(' + j.id + ')">Cancel</button>';
|
||||||
|
} else if (j.status === 'failed' || j.status === 'cancelled') {
|
||||||
|
actions = '<button class="btn" onclick="SCRAPER.retry(' + j.id + ')">Retry</button> ' +
|
||||||
|
'<button class="btn btn-danger" onclick="SCRAPER.remove(' + j.id + ')">Delete</button>';
|
||||||
|
} else if (j.status === 'complete') {
|
||||||
|
actions = '<button class="btn btn-danger" onclick="SCRAPER.remove(' + j.id + ')">Delete</button>';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Truncate URL for display
|
||||||
|
var displayUrl = j.url.length > 40 ? j.url.substring(0, 40) + '\u2026' : j.url;
|
||||||
|
|
||||||
|
html += '<tr>' +
|
||||||
|
'<td>' + j.id + '</td>' +
|
||||||
|
'<td><a href="' + escHtml(j.url) + '" target="_blank" title="' + escHtml(j.url) + '">' + escHtml(displayUrl) + '</a></td>' +
|
||||||
|
'<td>' + escHtml(j.title || '\u2014') + '</td>' +
|
||||||
|
'<td>' + pages + '</td>' +
|
||||||
|
'<td>' + badge + errorTooltip(j) + '</td>' +
|
||||||
|
'<td>' + zim + '</td>' +
|
||||||
|
'<td>' + actions + '</td>' +
|
||||||
|
'</tr>';
|
||||||
|
});
|
||||||
|
if (!html) html = '<tr><td colspan="7" class="text-muted">No scrape jobs</td></tr>';
|
||||||
|
RECON.setHTML('sc-table-body', html);
|
||||||
|
}).catch(function(err) {
|
||||||
|
console.error('Scraper dashboard error:', err);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function statusBadge(status) {
|
||||||
|
var map = {
|
||||||
|
'pending': '<span class="badge-detected">PENDING</span>',
|
||||||
|
'scraping': '<span class="badge-processing">SCRAPING</span>',
|
||||||
|
'registering': '<span class="badge-processing">REGISTERING</span>',
|
||||||
|
'complete': '<span class="badge-complete">COMPLETE</span>',
|
||||||
|
'failed': '<span class="badge-failed">FAILED</span>',
|
||||||
|
'cancelled': '<span class="badge-detected">CANCELLED</span>'
|
||||||
|
};
|
||||||
|
return map[status] || '<span class="badge-detected">' + (status || 'UNKNOWN').toUpperCase() + '</span>';
|
||||||
|
}
|
||||||
|
|
||||||
|
function errorTooltip(job) {
|
||||||
|
if (!job.error_message) return '';
|
||||||
|
var short = job.error_message.length > 80 ?
|
||||||
|
job.error_message.substring(0, 80) + '\u2026' : job.error_message;
|
||||||
|
return '<div class="text-small text-muted" style="max-width:200px;word-break:break-all;" title="' +
|
||||||
|
escHtml(job.error_message) + '">' + escHtml(short) + '</div>';
|
||||||
|
}
|
||||||
|
|
||||||
|
function escHtml(str) {
|
||||||
|
if (!str) return '';
|
||||||
|
return str.replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>')
|
||||||
|
.replace(/"/g, '"').replace(/'/g, ''');
|
||||||
|
}
|
||||||
|
|
||||||
|
function submit(e) {
|
||||||
|
e.preventDefault();
|
||||||
|
var url = document.getElementById('sf-url').value.trim();
|
||||||
|
if (!url) return false;
|
||||||
|
|
||||||
|
var body = { url: url };
|
||||||
|
var title = document.getElementById('sf-title').value.trim();
|
||||||
|
var lang = document.getElementById('sf-lang').value;
|
||||||
|
var category = document.getElementById('sf-category').value.trim();
|
||||||
|
if (title) body.title = title;
|
||||||
|
if (lang) body.language = lang;
|
||||||
|
if (category) body.category = category;
|
||||||
|
|
||||||
|
var btn = document.getElementById('sf-submit-btn');
|
||||||
|
var feedback = document.getElementById('sf-feedback');
|
||||||
|
btn.disabled = true;
|
||||||
|
btn.textContent = 'Submitting...';
|
||||||
|
|
||||||
|
RECON.postJSON('/api/scraper/submit', body).then(function(data) {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = 'Submit';
|
||||||
|
if (data.ok) {
|
||||||
|
feedback.style.display = 'block';
|
||||||
|
feedback.style.color = '#00ff41';
|
||||||
|
feedback.textContent = 'Job #' + data.job_id + ' submitted successfully';
|
||||||
|
document.getElementById('sf-url').value = '';
|
||||||
|
document.getElementById('sf-title').value = '';
|
||||||
|
document.getElementById('sf-category').value = '';
|
||||||
|
setTimeout(function() { feedback.style.display = 'none'; }, 4000);
|
||||||
|
loadJobs();
|
||||||
|
} else {
|
||||||
|
feedback.style.display = 'block';
|
||||||
|
feedback.style.color = '#ff4444';
|
||||||
|
feedback.textContent = 'Error: ' + (data.error || 'Unknown error');
|
||||||
|
}
|
||||||
|
}).catch(function(err) {
|
||||||
|
btn.disabled = false;
|
||||||
|
btn.textContent = 'Submit';
|
||||||
|
feedback.style.display = 'block';
|
||||||
|
feedback.style.color = '#ff4444';
|
||||||
|
feedback.textContent = 'Network error: ' + err.message;
|
||||||
|
});
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
function cancel(jobId) {
|
||||||
|
if (!confirm('Cancel job #' + jobId + '?')) return;
|
||||||
|
RECON.postJSON('/api/scraper/cancel/' + jobId).then(function(data) {
|
||||||
|
if (data.ok) loadJobs();
|
||||||
|
else alert('Error: ' + (data.error || 'Unknown'));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function retry(jobId) {
|
||||||
|
RECON.postJSON('/api/scraper/retry/' + jobId).then(function(data) {
|
||||||
|
if (data.ok) loadJobs();
|
||||||
|
else alert('Error: ' + (data.error || 'Unknown'));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function remove(jobId) {
|
||||||
|
if (!confirm('Delete job #' + jobId + '? This cannot be undone.')) return;
|
||||||
|
RECON.postJSON('/api/scraper/delete/' + jobId).then(function(data) {
|
||||||
|
if (data.ok) loadJobs();
|
||||||
|
else alert('Error: ' + (data.error || 'Unknown'));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function clearFailed() {
|
||||||
|
if (!confirm('Delete all failed and cancelled jobs?')) return;
|
||||||
|
RECON.postJSON('/api/scraper/clear-failed').then(function(data) {
|
||||||
|
if (data.ok) loadJobs();
|
||||||
|
else alert('Error: ' + (data.error || 'Unknown'));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Expose for inline onclick
|
||||||
|
window.SCRAPER = { submit: submit, cancel: cancel, retry: retry, remove: remove, clearFailed: clearFailed };
|
||||||
|
|
||||||
|
document.addEventListener('DOMContentLoaded', function() {
|
||||||
|
RECON.startRefresh(loadJobs, 10000);
|
||||||
|
});
|
||||||
|
})();
|
||||||
|
|
@ -19,6 +19,7 @@
|
||||||
<div class="nav-domain">
|
<div class="nav-domain">
|
||||||
<a href="/"{% if domain == 'knowledge' %} class="active"{% endif %}>Knowledge</a>
|
<a href="/"{% if domain == 'knowledge' %} class="active"{% endif %}>Knowledge</a>
|
||||||
<a href="/peertube"{% if domain == 'peertube' %} class="active"{% endif %}>PeerTube</a>
|
<a href="/peertube"{% if domain == 'peertube' %} class="active"{% endif %}>PeerTube</a>
|
||||||
|
<a href="/kiwix"{% if domain == 'kiwix' %} class="active"{% endif %}>Kiwix</a>
|
||||||
<a href="/search"{% if domain == 'search' %} class="active"{% endif %}>Search</a>
|
<a href="/search"{% if domain == 'search' %} class="active"{% endif %}>Search</a>
|
||||||
<a href="/settings/keys"{% if domain == 'settings' %} class="active"{% endif %}>Settings</a>
|
<a href="/settings/keys"{% if domain == 'settings' %} class="active"{% endif %}>Settings</a>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
||||||
48
templates/kiwix/dashboard.html
Normal file
48
templates/kiwix/dashboard.html
Normal file
|
|
@ -0,0 +1,48 @@
|
||||||
|
{% extends "base.html" %}
|
||||||
|
{% block content %}
|
||||||
|
<div id="kiwix-dashboard">
|
||||||
|
<!-- Stats row: 4 cards -->
|
||||||
|
<div class="stat-grid" style="grid-template-columns:repeat(4, 1fr);">
|
||||||
|
<div class="stat-card"><div class="label">ZIM Sources</div><div class="value" id="kx-sources">—</div></div>
|
||||||
|
<div class="stat-card"><div class="label">Total Articles</div><div class="value" id="kx-articles">—</div></div>
|
||||||
|
<div class="stat-card"><div class="label">Processed</div><div class="value" id="kx-processed">—</div></div>
|
||||||
|
<div class="stat-card"><div class="label">In Pipeline</div><div class="value" id="kx-pipeline">—</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Kiwix-serve status -->
|
||||||
|
<div class="svc-row">
|
||||||
|
<div class="svc-item"><span class="svc-dot unknown" id="svc-kiwix-serve"></span>Kiwix-Serve</div>
|
||||||
|
<div class="svc-item"><a href="https://wiki.echo6.co" target="_blank" class="text-muted" id="kx-browse-link">Browse Wiki Library</a></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- ZIM Library Table -->
|
||||||
|
<div class="panel">
|
||||||
|
<h3 class="section-title" style="margin-bottom:12px;">ZIM Library</h3>
|
||||||
|
<table class="data-table" id="kx-table">
|
||||||
|
<thead>
|
||||||
|
<tr><th>Title</th><th>Language</th><th>Articles</th><th>Progress</th><th>Status</th><th>Ingest</th><th>Browse</th><th></th></tr>
|
||||||
|
</thead>
|
||||||
|
<tbody id="kx-table-body">
|
||||||
|
<tr><td colspan="8" class="text-muted">Loading...</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Upload Section -->
|
||||||
|
<div class="panel">
|
||||||
|
<h3 class="section-title" style="margin-bottom:12px;">Upload ZIM File</h3>
|
||||||
|
<div class="upload-area" id="kx-upload-area">
|
||||||
|
<input type="file" id="kx-file-input" accept=".zim" style="display:none">
|
||||||
|
<button class="btn" onclick="document.getElementById('kx-file-input').click()">Choose .zim file</button>
|
||||||
|
<span id="kx-upload-status" class="text-muted" style="margin-left:12px;"></span>
|
||||||
|
</div>
|
||||||
|
<div id="kx-upload-progress" style="display:none; margin-top:8px;">
|
||||||
|
<div class="pipeline-bar"><div id="kx-progress-bar" class="segment" style="width:0%;background:#7c3aed;"></div></div>
|
||||||
|
<span class="text-small text-muted" id="kx-progress-text"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endblock %}
|
||||||
|
{% block scripts %}
|
||||||
|
<script src="/static/js/kiwix.js"></script>
|
||||||
|
{% endblock %}
|
||||||
84
templates/kiwix/scraper.html
Normal file
84
templates/kiwix/scraper.html
Normal file
|
|
@ -0,0 +1,84 @@
|
||||||
|
{% extends "base.html" %}
|
||||||
|
{% block content %}
|
||||||
|
<div id="scraper-page">
|
||||||
|
<!-- Submit Form -->
|
||||||
|
<div class="panel">
|
||||||
|
<h3 class="section-title" style="margin-bottom:12px;">Submit Scrape Job</h3>
|
||||||
|
<form id="scraper-form" onsubmit="return SCRAPER.submit(event)">
|
||||||
|
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;margin-bottom:12px;">
|
||||||
|
<div>
|
||||||
|
<label class="text-small text-muted" style="display:block;margin-bottom:4px;">URL *</label>
|
||||||
|
<input type="url" id="sf-url" placeholder="https://example.com/" required
|
||||||
|
style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<label class="text-small text-muted" style="display:block;margin-bottom:4px;">Title</label>
|
||||||
|
<input type="text" id="sf-title" placeholder="Optional display title"
|
||||||
|
style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div style="display:grid;grid-template-columns:1fr 1fr auto;gap:12px;align-items:end;">
|
||||||
|
<div>
|
||||||
|
<label class="text-small text-muted" style="display:block;margin-bottom:4px;">Language</label>
|
||||||
|
<select id="sf-lang"
|
||||||
|
style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
|
||||||
|
<option value="eng" selected>English</option>
|
||||||
|
<option value="spa">Spanish</option>
|
||||||
|
<option value="fra">French</option>
|
||||||
|
<option value="deu">German</option>
|
||||||
|
<option value="por">Portuguese</option>
|
||||||
|
<option value="rus">Russian</option>
|
||||||
|
<option value="jpn">Japanese</option>
|
||||||
|
<option value="zho">Chinese</option>
|
||||||
|
<option value="mul">Multilingual</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<label class="text-small text-muted" style="display:block;margin-bottom:4px;">Category</label>
|
||||||
|
<input type="text" id="sf-category" placeholder="Optional"
|
||||||
|
style="width:100%;padding:8px 12px;background:var(--bg-secondary);border:1px solid var(--border);color:var(--text-primary);border-radius:var(--radius);font-family:inherit;font-size:13px;">
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<button type="submit" class="btn" id="sf-submit-btn">Submit</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div id="sf-feedback" style="margin-top:8px;font-size:12px;display:none;"></div>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Stats row -->
|
||||||
|
<div class="stat-grid" style="grid-template-columns:repeat(4, 1fr);">
|
||||||
|
<div class="stat-card"><div class="label">Total Jobs</div><div class="value" id="sc-total">—</div></div>
|
||||||
|
<div class="stat-card"><div class="label">Active</div><div class="value" id="sc-active">—</div></div>
|
||||||
|
<div class="stat-card"><div class="label">Complete</div><div class="value" id="sc-complete">—</div></div>
|
||||||
|
<div class="stat-card"><div class="label">Failed</div><div class="value" id="sc-failed">—</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Jobs Table -->
|
||||||
|
<div class="panel">
|
||||||
|
<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:12px;">
|
||||||
|
<h3 class="section-title" style="margin:0;">Scrape Jobs</h3>
|
||||||
|
<button class="btn btn-danger" onclick="SCRAPER.clearFailed()" id="sc-clear-btn" style="display:none;">Clear Failed</button>
|
||||||
|
</div>
|
||||||
|
<table class="data-table" id="sc-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>ID</th>
|
||||||
|
<th>URL</th>
|
||||||
|
<th>Title</th>
|
||||||
|
<th>Pages</th>
|
||||||
|
<th>Status</th>
|
||||||
|
<th>ZIM</th>
|
||||||
|
<th></th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody id="sc-table-body">
|
||||||
|
<tr><td colspan="7" class="text-muted">Loading...</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endblock %}
|
||||||
|
{% block scripts %}
|
||||||
|
<script src="/static/js/scraper.js"></script>
|
||||||
|
{% endblock %}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue