mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Add langdetect language filter to enricher + purge non-English ZIM articles
- Install langdetect package for content-level language detection
- Add _check_language() to enricher.py: reads first 1500 chars of first page, detects language via langdetect, skips if not in allowed list
- Configurable via config.yaml pipeline.language_filter and pipeline.allowed_languages (default: en only)
- Catches non-English content from ANY source (PDF, web, ZIM, PeerTube) before burning Gemini API quota on enrichment
- Add scan_zims retry logic (3 attempts, 2s delay) for upload handler
- Purged 6,483 stale non-English zim_articles rows from DB

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
501004ecf1
commit
6f2a1d206e
3 changed files with 67 additions and 6 deletions
16
lib/api.py
16
lib/api.py
|
|
@@ -2030,12 +2030,16 @@ def api_kiwix_upload():
|
|||
except Exception as e:
|
||||
logger.warning(f"kiwix-manage add failed: {e}")
|
||||
|
||||
# Scan for new entry
|
||||
try:
|
||||
from .zim_monitor import scan_zims
|
||||
scan_zims()
|
||||
except Exception as e:
|
||||
logger.warning(f"scan_zims after upload failed: {e}")
|
||||
# Scan for new entry (retry — monitorLibrary may need a moment to reload)
|
||||
import time as _time
|
||||
from .zim_monitor import scan_zims
|
||||
for attempt in range(3):
|
||||
try:
|
||||
scan_zims()
|
||||
break
|
||||
except Exception as e:
|
||||
logger.warning(f"scan_zims attempt {attempt+1} failed: {e}")
|
||||
_time.sleep(2)
|
||||
|
||||
# Refresh cache
|
||||
try:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue