diff --git a/lib/processors/zim_processor.py b/lib/processors/zim_processor.py index ba29952..b258408 100644 --- a/lib/processors/zim_processor.py +++ b/lib/processors/zim_processor.py @@ -38,6 +38,39 @@ MIN_TEXT_LENGTH = 200 # Elements to strip before text extraction STRIP_TAGS = {'nav', 'footer', 'script', 'style', 'header', 'aside'} +# Trailing path-suffix pattern for translated articles (MediaWiki ZIMs use /XX or /XXX suffixes) +# Matches any path ending in '/' plus 2-3 lowercase ASCII letters; a match alone does not mark an article as non-English +_LANG_SUFFIX_RE = re.compile(r'/[a-z]{2,3}$') +# Two-letter language codes to filter (excludes 'en'); every entry has two letters, so a three-letter suffix matches the regex above but is never in this set +_NON_EN_LANGS = { + 'aa','ab','af','ak','am','an','ar','as','av','ay','az', + 'ba','be','bg','bh','bi','bm','bn','bo','br','bs', + 'ca','ce','ch','co','cr','cs','cu','cv','cy', + 'da','de','dv','dz', + 'ee','el','eo','es','et','eu', + 'fa','ff','fi','fj','fo','fr','fy', + 'ga','gd','gl','gn','gu','gv', + 'ha','he','hi','ho','hr','ht','hu','hy','hz', + 'ia','id','ie','ig','ii','ik','io','is','it','iu', + 'ja','jv', + 'ka','kg','ki','kj','kk','kl','km','kn','ko','kr','ks','ku','kv','kw','ky', + 'la','lb','lg','li','ln','lo','lt','lu','lv', + 'mg','mh','mi','mk','ml','mn','mo','mr','ms','mt','my', + 'na','nb','nd','ne','ng','nl','nn','no','nr','nv','ny', + 'oc','oj','om','or','os', + 'pa','pi','pl','ps','pt', + 'qu', + 'rm','rn','ro','ru','rw', + 'sa','sc','sd','se','sg','sh','si','sk','sl','sm','sn','so','sq','sr','ss','st','su','sv','sw', + 'ta','te','tg','th','ti','tk','tl','tn','to','tr','ts','tt','tw','ty', + 'ug','uk','ur','uz', + 've','vi','vo', + 'wa','wo', + 'xh', + 'yi','yo', + 'za','zh','zu', +} + def _text_hash(text): """Compute MD5 hash of text content (matching content_hash style).""" @@ -190,6 +223,13 @@ def ingest_zim(zim_source_id, db, config, stop_event=None, if article_path in existing_paths: continue + # Skip non-English articles (MediaWiki translation suffix pattern) + lang_match = _LANG_SUFFIX_RE.search(article_path) + if lang_match and 
lang_match.group(0)[1:] in _NON_EN_LANGS: + stats['skipped'] += 1 + total_processed_this_run += 1 + continue + # Extract and clean text try: html_bytes = bytes(item.content)