mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Add langdetect language filter to enricher + purge non-English ZIM articles
- Install langdetect package for content-level language detection
- Add _check_language() to enricher.py: reads the first 1500 chars of the first page, detects the language via langdetect, and skips the document if it is not in the allowed list
- Configurable via config.yaml pipeline.language_filter and pipeline.allowed_languages (default: en only)
- Catches non-English content from ANY source (PDF, web, ZIM, PeerTube) before burning Gemini API quota on enrichment
- Add scan_zims retry logic (3 attempts, 2s delay) for upload handler
- Purged 6,483 stale non-English zim_articles rows from DB

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
501004ecf1
commit
6f2a1d206e
3 changed files with 67 additions and 6 deletions
|
|
@ -440,3 +440,7 @@ pipeline:
|
||||||
text: text_processor
|
text: text_processor
|
||||||
# mtime stability threshold for picking up files from acquired/
|
# mtime stability threshold for picking up files from acquired/
|
||||||
mtime_stability_seconds: 10
|
mtime_stability_seconds: 10
|
||||||
|
# Language filter: skip content whose detected language is not in allowed_languages before Gemini enrichment
|
||||||
|
language_filter: true # Enable langdetect-based filtering
|
||||||
|
allowed_languages: # ISO 639-1 codes allowed through enrichment
|
||||||
|
- en
|
||||||
|
|
|
||||||
10
lib/api.py
10
lib/api.py
|
|
@ -2030,12 +2030,16 @@ def api_kiwix_upload():
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"kiwix-manage add failed: {e}")
|
logger.warning(f"kiwix-manage add failed: {e}")
|
||||||
|
|
||||||
# Scan for new entry
|
# Scan for new entry (retry — monitorLibrary may need a moment to reload)
|
||||||
try:
|
import time as _time
|
||||||
from .zim_monitor import scan_zims
|
from .zim_monitor import scan_zims
|
||||||
|
for attempt in range(3):
|
||||||
|
try:
|
||||||
scan_zims()
|
scan_zims()
|
||||||
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"scan_zims after upload failed: {e}")
|
logger.warning(f"scan_zims attempt {attempt+1} failed: {e}")
|
||||||
|
_time.sleep(2)
|
||||||
|
|
||||||
# Refresh cache
|
# Refresh cache
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,15 @@ from .utils import get_config, setup_logging
|
||||||
from .status import StatusDB
|
from .status import StatusDB
|
||||||
from .utils import resolve_text_dir
|
from .utils import resolve_text_dir
|
||||||
|
|
||||||
|
try:
|
||||||
|
from langdetect import detect as _detect_lang
|
||||||
|
from langdetect.lang_detect_exception import LangDetectException
|
||||||
|
_HAS_LANGDETECT = True
|
||||||
|
except ImportError:
|
||||||
|
_HAS_LANGDETECT = False
|
||||||
|
|
||||||
|
ALLOWED_LANGUAGES = {'en'} # Default: English only
|
||||||
|
|
||||||
logger = setup_logging('recon.enricher')
|
logger = setup_logging('recon.enricher')
|
||||||
|
|
||||||
# Docs stuck in "enriching" longer than this get reset to "extracted" for retry
|
# Docs stuck in "enriching" longer than this get reset to "extracted" for retry
|
||||||
|
|
@ -341,6 +350,42 @@ def validate_and_fix_concepts(concepts, key, config):
|
||||||
return concepts
|
return concepts
|
||||||
|
|
||||||
|
|
||||||
|
def _check_language(text_dir, config):
    """Decide whether a document's language may proceed to enrichment.

    Samples the first 1500 characters of the lexicographically first
    ``page_*.txt`` file in ``text_dir`` and classifies them with langdetect.
    The check is best-effort: whenever the language cannot be determined the
    document is allowed through (benefit of the doubt).

    Args:
        text_dir: Directory containing the extracted ``page_*.txt`` files.
        config: Parsed configuration mapping. Reads
            ``pipeline.language_filter`` (default ``True``) and
            ``pipeline.allowed_languages`` (default ``['en']``).

    Returns:
        A ``(is_allowed, detected_lang)`` tuple. ``detected_lang`` is the
        code reported by langdetect, or one of the sentinels ``'unknown'``
        (langdetect not installed), ``'disabled'`` (filter switched off),
        ``'no_pages'``, ``'too_short'``, ``'detection_failed'`` or
        ``'error'``; every sentinel is paired with ``is_allowed=True``.
    """
    if not _HAS_LANGDETECT:
        return True, 'unknown'

    # An empty "pipeline:" key parses to None in YAML; treat it as no settings.
    pipeline_cfg = config.get('pipeline') or {}
    if not pipeline_cfg.get('language_filter', True):
        return True, 'disabled'

    raw_allowed = pipeline_cfg.get('allowed_languages', ['en'])
    if raw_allowed is None:
        raw_allowed = ['en']
    elif isinstance(raw_allowed, str):
        # A scalar "allowed_languages: en" must not be split into {'e', 'n'}.
        raw_allowed = [raw_allowed]
    allowed = {str(code).strip().lower() for code in raw_allowed}

    try:
        # Only the first page is needed, so take the minimum instead of
        # sorting the whole listing.
        first_page = min(
            (name for name in os.listdir(text_dir)
             if name.startswith('page_') and name.endswith('.txt')),
            default=None,
        )
        if first_page is None:
            return True, 'no_pages'

        with open(os.path.join(text_dir, first_page), encoding='utf-8') as f:
            sample = f.read(1500)
        if len(sample.strip()) < 50:
            # Too little text for a meaningful classification.
            return True, 'too_short'

        # NOTE(review): langdetect's README describes detection as
        # non-deterministic unless DetectorFactory.seed is set; confirm
        # whether the module should seed it for reproducible filtering.
        lang = _detect_lang(sample)
        return (lang.lower() in allowed), lang
    except LangDetectException:
        return True, 'detection_failed'
    except Exception as exc:
        # Deliberate best-effort: never block enrichment on an I/O or decode
        # problem, but leave a trace instead of failing silently.
        logger.warning(f"Language check failed for {text_dir}: {exc}")
        return True, 'error'
|
||||||
|
|
||||||
|
|
||||||
def enrich_single(file_hash, db, config, key_rotator):
|
def enrich_single(file_hash, db, config, key_rotator):
|
||||||
doc = db.get_document(file_hash)
|
doc = db.get_document(file_hash)
|
||||||
if not doc:
|
if not doc:
|
||||||
|
|
@ -359,6 +404,14 @@ def enrich_single(file_hash, db, config, key_rotator):
|
||||||
db.mark_failed(file_hash, f"Text directory not found: {text_dir}")
|
db.mark_failed(file_hash, f"Text directory not found: {text_dir}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# Language gate: skip non-English documents before burning Gemini quota
|
||||||
|
lang_ok, detected_lang = _check_language(text_dir, config)
|
||||||
|
if not lang_ok:
|
||||||
|
logger.info(f"Skipping {file_hash[:12]}... detected language '{detected_lang}' "
|
||||||
|
f"(allowed: {config.get('pipeline', {}).get('allowed_languages', ['en'])})")
|
||||||
|
db.mark_failed(file_hash, f"Language filter: detected '{detected_lang}', not in allowed list")
|
||||||
|
return False
|
||||||
|
|
||||||
db.update_status(file_hash, 'enriching')
|
db.update_status(file_hash, 'enriching')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue