Filter non-English articles from ZIM ingestion

Skip articles with MediaWiki translation suffixes (/es, /fr, /pl, etc.) before text extraction to avoid wasting Gemini enrichment on translations. Uses path-based regex matching against ISO 639 language codes. ~5,276 non-English articles already ingested from Appropedia (top: es=837, zh=765, ru=475, fr=433, ko=407). Purge decision deferred.
2026-05-20 06:34:40 +02:00 · 2026-04-17 07:30:30 +00:00 · 2026-04-17 07:30:30 +00:00 · 501004ecf1
commit 501004ecf1
parent 2635160887
1 changed files with 40 additions and 0 deletions
--- a/lib/processors/zim_processor.py
+++ b/lib/processors/zim_processor.py
@@ -38,6 +38,39 @@ MIN_TEXT_LENGTH = 200
 # Elements to strip before text extraction
 STRIP_TAGS = {'nav', 'footer', 'script', 'style', 'header', 'aside'}

+# Non-English article path suffix pattern (MediaWiki ZIMs use /XX or /XXX translation suffixes)
+# Matches a trailing "/" followed by 2-3 lowercase ASCII letters at the very end of the path (a candidate language code)
+_LANG_SUFFIX_RE = re.compile(r'/[a-z]{2,3}$')
+# Common two-letter (ISO 639-1 style) language codes to filter; 'en' is deliberately absent. Only 2-letter codes are listed, so 3-letter suffixes matched by the regex above are never in this set.
+_NON_EN_LANGS = {
+    'aa','ab','af','ak','am','an','ar','as','av','ay','az',
+    'ba','be','bg','bh','bi','bm','bn','bo','br','bs',
+    'ca','ce','ch','co','cr','cs','cu','cv','cy',
+    'da','de','dv','dz',
+    'ee','el','eo','es','et','eu',
+    'fa','ff','fi','fj','fo','fr','fy',
+    'ga','gd','gl','gn','gu','gv',
+    'ha','he','hi','ho','hr','ht','hu','hy','hz',
+    'ia','id','ie','ig','ii','ik','io','is','it','iu',
+    'ja','jv',
+    'ka','kg','ki','kj','kk','kl','km','kn','ko','kr','ks','ku','kv','kw','ky',
+    'la','lb','lg','li','ln','lo','lt','lu','lv',
+    'mg','mh','mi','mk','ml','mn','mo','mr','ms','mt','my',
+    'na','nb','nd','ne','ng','nl','nn','no','nr','nv','ny',
+    'oc','oj','om','or','os',
+    'pa','pi','pl','ps','pt',
+    'qu',
+    'rm','rn','ro','ru','rw',
+    'sa','sc','sd','se','sg','sh','si','sk','sl','sm','sn','so','sq','sr','ss','st','su','sv','sw',
+    'ta','te','tg','th','ti','tk','tl','tn','to','tr','ts','tt','tw','ty',
+    'ug','uk','ur','uz',
+    've','vi','vo',
+    'wa','wo',
+    'xh',
+    'yi','yo',
+    'za','zh','zu',
+}
+

 def _text_hash(text):
    """Compute MD5 hash of text content (matching content_hash style)."""
@@ -190,6 +223,13 @@ def ingest_zim(zim_source_id, db, config, stop_event=None,
        if article_path in existing_paths:
            continue

+        # Skip non-English articles (MediaWiki translation suffix pattern)
+        lang_match = _LANG_SUFFIX_RE.search(article_path)
+        if lang_match and lang_match.group(0)[1:] in _NON_EN_LANGS:
+            stats['skipped'] += 1
+            total_processed_this_run += 1
+            continue
+
        # Extract and clean text
        try:
            html_bytes = bytes(item.content)