Filter non-English articles from ZIM ingestion

Skip articles with MediaWiki translation suffixes (/es, /fr, /pl, etc.)
before text extraction to avoid wasting Gemini enrichment on translations.
Matches a trailing /xx path suffix with a regex, then checks that suffix
against a set of two-letter ISO 639-1 language codes (excluding 'en').

~5,276 non-English articles already ingested from Appropedia (top: es=837,
zh=765, ru=475, fr=433, ko=407). Purge decision deferred.
This commit is contained in:
Matt 2026-04-17 07:30:30 +00:00
commit 501004ecf1

View file

@ -38,6 +38,39 @@ MIN_TEXT_LENGTH = 200
# HTML elements removed from the document before its text is extracted.
STRIP_TAGS = {'aside', 'footer', 'header', 'nav', 'script', 'style'}

# Trailing "/xx" or "/xxx" path segment (two or three lowercase ASCII
# letters): the suffix shape MediaWiki ZIMs use for translated articles.
_LANG_SUFFIX_RE = re.compile(r'/[a-z]{2,3}$')

# Language codes whose suffix marks an article path as a translation.
# 'en' is left out on purpose so English paths are never filtered.
# Every entry is two letters long, so a three-letter suffix matched by
# _LANG_SUFFIX_RE is never found in this set.
_NON_EN_LANGS = set(
    """
    aa ab af ak am an ar as av ay az
    ba be bg bh bi bm bn bo br bs
    ca ce ch co cr cs cu cv cy
    da de dv dz
    ee el eo es et eu
    fa ff fi fj fo fr fy
    ga gd gl gn gu gv
    ha he hi ho hr ht hu hy hz
    ia id ie ig ii ik io is it iu
    ja jv
    ka kg ki kj kk kl km kn ko kr ks ku kv kw ky
    la lb lg li ln lo lt lu lv
    mg mh mi mk ml mn mo mr ms mt my
    na nb nd ne ng nl nn no nr nv ny
    oc oj om or os
    pa pi pl ps pt
    qu
    rm rn ro ru rw
    sa sc sd se sg sh si sk sl sm sn so sq sr ss st su sv sw
    ta te tg th ti tk tl tn to tr ts tt tw ty
    ug uk ur uz
    ve vi vo
    wa wo
    xh
    yi yo
    za zh zu
    """.split()
)
def _text_hash(text):
"""Compute MD5 hash of text content (matching content_hash style)."""
@ -190,6 +223,13 @@ def ingest_zim(zim_source_id, db, config, stop_event=None,
if article_path in existing_paths:
continue
# Skip non-English articles (MediaWiki translation suffix pattern)
lang_match = _LANG_SUFFIX_RE.search(article_path)
if lang_match and lang_match.group(0)[1:] in _NON_EN_LANGS:
stats['skipped'] += 1
total_processed_this_run += 1
continue
# Extract and clean text
try:
html_bytes = bytes(item.content)