Add langdetect language filter to enricher + purge non-English ZIM articles

- Install langdetect package for content-level language detection
- Add _check_language() to enricher.py: reads first 1500 chars of first page, detects language via langdetect, skips if not in allowed list
- Configurable via config.yaml pipeline.language_filter and pipeline.allowed_languages (default: en only)
- Catches non-English content from ANY source (PDF, web, ZIM, PeerTube) before burning Gemini API quota on enrichment
- Add scan_zims retry logic (3 attempts, 2s delay) for upload handler
- Purged 6,483 stale non-English zim_articles rows from DB

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-20 06:34:40 +02:00 · 2026-04-17 14:37:13 +00:00 · 2026-04-17 14:37:13 +00:00 · 6f2a1d206e
commit 6f2a1d206e
parent 501004ecf1
3 changed files with 67 additions and 6 deletions
--- a/config.yaml
+++ b/config.yaml
@ -440,3 +440,7 @@ pipeline:
    text: text_processor
  # mtime stability threshold for picking up files from acquired/
  mtime_stability_seconds: 10
  # Language filter: skip content whose detected language is not in allowed_languages before Gemini enrichment
  language_filter: true           # Enable langdetect-based filtering
  allowed_languages:              # ISO 639-1 codes allowed through enrichment
    - en
--- a/lib/api.py
+++ b/lib/api.py
@ -2030,12 +2030,16 @@ def api_kiwix_upload():
    except Exception as e:
        logger.warning(f"kiwix-manage add failed: {e}")
-    # Scan for new entry
+    # Scan for new entry (retry — monitorLibrary may need a moment to reload)
-    try:
+    import time as _time
    from .zim_monitor import scan_zims
    for attempt in range(3):
        try:
            scan_zims()
            break
        except Exception as e:
-        logger.warning(f"scan_zims after upload failed: {e}")
+            logger.warning(f"scan_zims attempt {attempt+1} failed: {e}")
            _time.sleep(2)
    # Refresh cache
    try:
--- a/lib/enricher.py
+++ b/lib/enricher.py
@ -27,6 +27,15 @@ from .utils import get_config, setup_logging
 from .status import StatusDB
 from .utils import resolve_text_dir
 try:
    from langdetect import detect as _detect_lang
    from langdetect.lang_detect_exception import LangDetectException
    _HAS_LANGDETECT = True
 except ImportError:
    _HAS_LANGDETECT = False
 ALLOWED_LANGUAGES = {'en'}  # Default: English only
 logger = setup_logging('recon.enricher')
 # Docs stuck in "enriching" longer than this get reset to "extracted" for retry
@ -341,6 +350,42 @@ def validate_and_fix_concepts(concepts, key, config):
    return concepts
 def _check_language(text_dir, config):
    """Decide whether a document's language is allowed through enrichment.

    Reads up to the first 1500 characters of the first ``page_*.txt`` file
    (sorted by filename) in ``text_dir`` and runs langdetect on that sample.

    Args:
        text_dir: Directory containing the extracted ``page_*.txt`` files.
        config: Configuration mapping. ``pipeline.language_filter`` toggles
            the check (default on); ``pipeline.allowed_languages`` lists the
            accepted language codes (default: ``ALLOWED_LANGUAGES``).

    Returns:
        ``(is_allowed, detected)``. ``detected`` is the code returned by
        langdetect, or one of the sentinels 'unknown' (langdetect missing),
        'disabled', 'no_pages', 'too_short', 'detection_failed' or 'error'.
        Every sentinel comes with ``is_allowed == True`` (benefit of the
        doubt), so a failed check never blocks a document.
    """
    if not _HAS_LANGDETECT:
        return True, 'unknown'

    # A key that is present but empty in the config (e.g. a bare `pipeline:`)
    # yields None rather than the .get() default, hence the `or {}`.
    pipeline_cfg = config.get('pipeline') or {}
    if not pipeline_cfg.get('language_filter', True):
        return True, 'disabled'

    configured = pipeline_cfg.get('allowed_languages')
    if configured is None:
        configured = ALLOWED_LANGUAGES
    elif isinstance(configured, str):
        # A scalar (`allowed_languages: en`) must not be split into
        # characters by the set construction below.
        configured = [configured]
    # Normalise case/whitespace so 'EN' or ' en ' in the config still match
    # the lowercase codes langdetect returns.
    allowed = {str(code).strip().lower() for code in configured}

    # NOTE(review): langdetect documents its results as non-deterministic
    # unless DetectorFactory.seed is set; borderline samples may flip between
    # runs. Consider seeding once at import time -- confirm before changing.
    try:
        page_files = sorted(f for f in os.listdir(text_dir)
                            if f.startswith('page_') and f.endswith('.txt'))
        if not page_files:
            return True, 'no_pages'
        with open(os.path.join(text_dir, page_files[0]), encoding='utf-8') as f:
            sample = f.read(1500)
        # Very short samples give unreliable detections; let them through.
        if len(sample.strip()) < 50:
            return True, 'too_short'
        lang = _detect_lang(sample)
        return (lang in allowed), lang
    except LangDetectException:
        return True, 'detection_failed'
    except Exception as e:
        # Best-effort gate: an unreadable directory or page must not fail the
        # document, but leave a trace instead of swallowing the error.
        logger.warning(f"Language check failed for {text_dir}: {e}")
        return True, 'error'
 def enrich_single(file_hash, db, config, key_rotator):
    doc = db.get_document(file_hash)
    if not doc:
@ -359,6 +404,14 @@ def enrich_single(file_hash, db, config, key_rotator):
        db.mark_failed(file_hash, f"Text directory not found: {text_dir}")
        return False
    # Language gate: skip non-English documents before burning Gemini quota
    lang_ok, detected_lang = _check_language(text_dir, config)
    if not lang_ok:
        logger.info(f"Skipping {file_hash[:12]}... detected language '{detected_lang}' "
                     f"(allowed: {config.get('pipeline', {}).get('allowed_languages', ['en'])})")
        db.mark_failed(file_hash, f"Language filter: detected '{detected_lang}', not in allowed list")
        return False
    db.update_status(file_hash, 'enriching')
    try: