diff --git a/config.yaml b/config.yaml index 3e185f8..4b147fd 100644 --- a/config.yaml +++ b/config.yaml @@ -440,3 +440,7 @@ pipeline: text: text_processor # mtime stability threshold for picking up files from acquired/ mtime_stability_seconds: 10 + # Language filter: skip non-English content before Gemini enrichment + language_filter: true # Enable langdetect-based filtering + allowed_languages: # ISO 639-1 codes allowed through enrichment + - en diff --git a/lib/api.py b/lib/api.py index 980578b..653c80f 100644 --- a/lib/api.py +++ b/lib/api.py @@ -2030,12 +2030,16 @@ def api_kiwix_upload(): except Exception as e: logger.warning(f"kiwix-manage add failed: {e}") - # Scan for new entry - try: - from .zim_monitor import scan_zims - scan_zims() - except Exception as e: - logger.warning(f"scan_zims after upload failed: {e}") + # Scan for new entry (retry — monitorLibrary may need a moment to reload) + import time as _time + from .zim_monitor import scan_zims + for attempt in range(3): + try: + scan_zims() + break + except Exception as e: + logger.warning(f"scan_zims attempt {attempt+1} failed: {e}") + _time.sleep(2) # Refresh cache try: diff --git a/lib/enricher.py b/lib/enricher.py index d9540aa..e1e583c 100644 --- a/lib/enricher.py +++ b/lib/enricher.py @@ -27,6 +27,15 @@ from .utils import get_config, setup_logging from .status import StatusDB from .utils import resolve_text_dir +try: + from langdetect import detect as _detect_lang + from langdetect.lang_detect_exception import LangDetectException + _HAS_LANGDETECT = True +except ImportError: + _HAS_LANGDETECT = False + +ALLOWED_LANGUAGES = {'en'} # Default: English only + logger = setup_logging('recon.enricher') # Docs stuck in "enriching" longer than this get reset to "extracted" for retry @@ -341,6 +350,42 @@ def validate_and_fix_concepts(concepts, key, config): return concepts +def _check_language(text_dir, config): + """Check language of document text. Returns (is_allowed, detected_lang). + + Reads first 1000 chars from first page file and uses langdetect. + Returns (True, lang) if language is allowed, (False, lang) if not. + Falls back to (True, 'unknown') if detection fails (benefit of the doubt). + """ + if not _HAS_LANGDETECT: + return True, 'unknown' + + # Check if language filter is enabled in config + pipeline_cfg = config.get('pipeline', {}) + if not pipeline_cfg.get('language_filter', True): + return True, 'disabled' + + allowed = set(pipeline_cfg.get('allowed_languages', ['en'])) + + # Read first page for detection + page_files = sorted([f for f in os.listdir(text_dir) + if f.startswith('page_') and f.endswith('.txt')]) + if not page_files: + return True, 'no_pages' + + try: + with open(os.path.join(text_dir, page_files[0]), encoding='utf-8') as f: + sample = f.read(1500) + if len(sample.strip()) < 50: + return True, 'too_short' + lang = _detect_lang(sample) + return (lang in allowed), lang + except LangDetectException: + return True, 'detection_failed' + except Exception: + return True, 'error' + + def enrich_single(file_hash, db, config, key_rotator): doc = db.get_document(file_hash) if not doc: @@ -359,6 +404,14 @@ def enrich_single(file_hash, db, config, key_rotator): db.mark_failed(file_hash, f"Text directory not found: {text_dir}") return False + # Language gate: skip non-English documents before burning Gemini quota + lang_ok, detected_lang = _check_language(text_dir, config) + if not lang_ok: + logger.info(f"Skipping {file_hash[:12]}... detected language '{detected_lang}' " + f"(allowed: {config.get('pipeline', {}).get('allowed_languages', ['en'])})") + db.mark_failed(file_hash, f"Language filter: detected '{detected_lang}', not in allowed list") + return False + db.update_status(file_hash, 'enriching') try: