Add langdetect language filter to enricher + purge non-English ZIM articles

- Install langdetect package for content-level language detection
- Add _check_language() to enricher.py: reads first 1500 chars of first
  page, detects language via langdetect, skips if not in allowed list
- Configurable via config.yaml pipeline.language_filter and
  pipeline.allowed_languages (default: en only)
- Catches non-English content from ANY source (PDF, web, ZIM, PeerTube)
  before burning Gemini API quota on enrichment
- Add scan_zims retry logic (3 attempts, 2s delay) for upload handler
- Purged 6,483 stale non-English zim_articles rows from DB

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-17 14:37:13 +00:00
commit 6f2a1d206e
3 changed files with 67 additions and 6 deletions

View file

@ -440,3 +440,7 @@ pipeline:
text: text_processor text: text_processor
# mtime stability threshold for picking up files from acquired/ # mtime stability threshold for picking up files from acquired/
mtime_stability_seconds: 10 mtime_stability_seconds: 10
# Language filter: skip content whose detected language is not in allowed_languages before Gemini enrichment
language_filter: true # Enable langdetect-based filtering
allowed_languages: # langdetect language codes allowed through enrichment (ISO 639-1, except Chinese: zh-cn / zh-tw)
- en

View file

@ -2030,12 +2030,16 @@ def api_kiwix_upload():
except Exception as e: except Exception as e:
logger.warning(f"kiwix-manage add failed: {e}") logger.warning(f"kiwix-manage add failed: {e}")
# Scan for new entry # Scan for new entry (retry — monitorLibrary may need a moment to reload)
try: import time as _time
from .zim_monitor import scan_zims from .zim_monitor import scan_zims
for attempt in range(3):
try:
scan_zims() scan_zims()
break
except Exception as e: except Exception as e:
logger.warning(f"scan_zims after upload failed: {e}") logger.warning(f"scan_zims attempt {attempt+1} failed: {e}")
_time.sleep(2)
# Refresh cache # Refresh cache
try: try:

View file

@ -27,6 +27,15 @@ from .utils import get_config, setup_logging
from .status import StatusDB from .status import StatusDB
from .utils import resolve_text_dir from .utils import resolve_text_dir
try:
from langdetect import detect as _detect_lang
from langdetect.lang_detect_exception import LangDetectException
_HAS_LANGDETECT = True
except ImportError:
_HAS_LANGDETECT = False
ALLOWED_LANGUAGES = {'en'} # Default: English only
logger = setup_logging('recon.enricher') logger = setup_logging('recon.enricher')
# Docs stuck in "enriching" longer than this get reset to "extracted" for retry # Docs stuck in "enriching" longer than this get reset to "extracted" for retry
@ -341,6 +350,42 @@ def validate_and_fix_concepts(concepts, key, config):
return concepts return concepts
def _check_language(text_dir, config):
    """Decide whether a document's language passes the enrichment filter.

    Reads up to the first 1500 characters of the lexicographically first
    ``page_*.txt`` file in *text_dir* and runs langdetect on that sample.

    Args:
        text_dir: Directory containing the extracted ``page_*.txt`` files.
        config: Configuration mapping. ``pipeline.language_filter`` toggles
            the check (default: enabled) and ``pipeline.allowed_languages``
            lists the accepted language codes (default: ALLOWED_LANGUAGES).

    Returns:
        A ``(is_allowed, detected)`` tuple. ``detected`` is the code reported
        by langdetect, or one of these status strings, all of which come with
        ``is_allowed=True`` (benefit of the doubt):
        ``'unknown'`` (langdetect not installed), ``'disabled'`` (filter
        turned off), ``'no_pages'``, ``'too_short'`` (fewer than 50
        non-whitespace-trimmed characters), ``'detection_failed'``, ``'error'``.
    """
    if not _HAS_LANGDETECT:
        return True, 'unknown'

    # `pipeline:` may be present but empty (None) in the YAML.
    pipeline_cfg = config.get('pipeline') or {}
    if not pipeline_cfg.get('language_filter', True):
        return True, 'disabled'

    configured = pipeline_cfg.get('allowed_languages')
    if configured is None:
        configured = ALLOWED_LANGUAGES
    elif isinstance(configured, str):
        # A scalar `allowed_languages: en` must not be split into letters.
        configured = [configured]
    allowed = {str(code).strip().lower() for code in configured}

    try:
        # Only the first page is sampled, so min() replaces a full sort.
        first_page = min(
            (name for name in os.listdir(text_dir)
             if name.startswith('page_') and name.endswith('.txt')),
            default=None,
        )
        if first_page is None:
            return True, 'no_pages'

        with open(os.path.join(text_dir, first_page), encoding='utf-8') as f:
            sample = f.read(1500)
        if len(sample.strip()) < 50:
            return True, 'too_short'

        # NOTE(review): langdetect is documented as non-deterministic unless
        # DetectorFactory.seed is set before first use -- confirm whether the
        # import site should pin the seed for reproducible filtering.
        lang = _detect_lang(sample)
    except LangDetectException:
        return True, 'detection_failed'
    except Exception as exc:
        # Best effort: an unreadable or undecodable file must not block
        # enrichment, but the reason should be visible in the logs.
        logger.warning("Language check failed for %s: %s", text_dir, exc)
        return True, 'error'

    # langdetect reports regional codes such as 'zh-cn'; let a bare primary
    # subtag ('zh') in the allowed list match them as well.
    normalized = lang.lower()
    is_allowed = normalized in allowed or normalized.split('-', 1)[0] in allowed
    return is_allowed, lang
def enrich_single(file_hash, db, config, key_rotator): def enrich_single(file_hash, db, config, key_rotator):
doc = db.get_document(file_hash) doc = db.get_document(file_hash)
if not doc: if not doc:
@ -359,6 +404,14 @@ def enrich_single(file_hash, db, config, key_rotator):
db.mark_failed(file_hash, f"Text directory not found: {text_dir}") db.mark_failed(file_hash, f"Text directory not found: {text_dir}")
return False return False
# Language gate: skip non-English documents before burning Gemini quota
lang_ok, detected_lang = _check_language(text_dir, config)
if not lang_ok:
logger.info(f"Skipping {file_hash[:12]}... detected language '{detected_lang}' "
f"(allowed: {config.get('pipeline', {}).get('allowed_languages', ['en'])})")
db.mark_failed(file_hash, f"Language filter: detected '{detected_lang}', not in allowed list")
return False
db.update_status(file_hash, 'enriching') db.update_status(file_hash, 'enriching')
try: try: