"""
RECON Enricher

Text to structured concepts via Gemini API.
Saves JSON to data/concepts/{hash}/ BEFORE any DB operations.
Uses 10-page windows, 4 API keys, 16 workers.

Resilience:
- Exponential backoff with jitter for transient errors (429, 500, 503, timeout)
- Permanent errors (JSON parse, auth) fail immediately without wasting retries
- Window failures skip that window and continue — partial enrichment beats zero
- Document marked enriched if ANY windows succeeded, failed only if ALL failed

Dependencies: google-generativeai
Config: processing.enrich_workers, processing.enrich_window_size, gemini, paths.concepts
"""

import json
import os
import random
import re
import threading
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone

import google.generativeai as genai

from .utils import get_config, setup_logging
from .status import StatusDB

logger = setup_logging('recon.enricher')

# Docs stuck in "enriching" longer than this get reset to "extracted" for retry
STALE_ENRICHING_HOURS = 2

# ── Classification allowlists ───────────────────────────────────────────────
VALID_DOMAINS = {
    'Agriculture & Livestock',
    'Civil Organization',
    'Communications',
    'Food Systems',
    'Foundational Skills',
    'Logistics',
    'Medical',
    'Navigation',
    'Operations',
    'Power Systems',
    'Preservation & Storage',
    'Security',
    'Shelter & Construction',
    'Technology',
    'Tools & Equipment',
    'Vehicles',
    'Water Systems',
    'Wilderness Skills',
}
VALID_KNOWLEDGE_TYPES = {'foundational', 'procedural', 'operational'}
VALID_COMPLEXITIES = {'basic', 'intermediate', 'advanced'}

DOMAIN_FALLBACK = 'Foundational Skills'
KNOWLEDGE_TYPE_FALLBACK = 'foundational'
COMPLEXITY_FALLBACK = 'basic'

# Characters that may legally follow a backslash inside a JSON string.
_VALID_JSON_ESCAPES = '"\\/bfnrtu'


def _escape_stray_backslash(match):
    """Double a backslash that does not start a valid JSON escape sequence.

    The regex feeding this callback consumes a backslash together with the
    character after it, so an already-valid pair such as an escaped backslash
    is left intact instead of being re-examined one character at a time.
    """
    follower = match.group(1)
    if follower and follower in _VALID_JSON_ESCAPES:
        return match.group(0)
    return '\\\\' + follower


def repair_json(text):
    """Attempt to repair common LLM JSON output issues including truncation.

    Applies, in order: control-character removal, escaping of invalid
    backslash sequences, and trailing-comma removal. If the text still does
    not parse, it is cut after the last object that closes at brace depth 0
    and any arrays still open at that point are closed.

    Args:
        text: Raw model output expected to contain JSON.

    Returns:
        The repaired text. It is NOT guaranteed to be valid JSON; callers
        must still parse it and handle ``json.JSONDecodeError``.
    """
    # Remove control characters except newlines and tabs
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)

    # Fix invalid JSON escape sequences (e.g. \e, \p, \c from Gemini)
    # Valid JSON escapes: \", \\, \/, \b, \f, \n, \r, \t, \uXXXX
    text = re.sub(r'\\(.?)', _escape_stray_backslash, text, flags=re.DOTALL)

    # Remove trailing commas before } or ]
    text = re.sub(r',\s*([}\]])', r'\1', text)

    # Nothing more to do if the cheap fixes were enough
    try:
        json.loads(text, strict=False)
        return text
    except json.JSONDecodeError:
        pass

    # Handle truncated JSON: scan forward, tracking nesting outside of string
    # literals, and remember the last position where an object closed at
    # brace depth 0 together with how many arrays were open at that point.
    last_complete = -1
    brackets_open_at_last = 0
    depth_brace = 0
    depth_bracket = 0
    in_string = False
    escape = False
    for i, ch in enumerate(text):
        if escape:
            escape = False
            continue
        if ch == '\\' and in_string:
            escape = True
            continue
        if ch == '"':
            in_string = not in_string
            continue
        if in_string:
            continue
        if ch == '{':
            depth_brace += 1
        elif ch == '}':
            depth_brace -= 1
            if depth_brace == 0:
                last_complete = i
                brackets_open_at_last = depth_bracket
        elif ch == '[':
            depth_bracket += 1
        elif ch == ']':
            depth_bracket -= 1

    if last_complete > 0:
        # Use the string-aware depth rather than counting '[' / ']' in the
        # raw text, which would miscount brackets inside string values.
        return text[:last_complete + 1] + ']' * max(brackets_open_at_last, 0)

    return text


ENRICH_PROMPT = """Extract knowledge concepts from this document text.

A concept is a SELF-CONTAINED piece of knowledge that can stand alone.

For each concept, provide ALL fields:

Required:
- content: Full text of the concept (complete procedure, definition, etc.)
- summary: 1-2 sentence summary
- title: Brief descriptive title
- domain: must be exactly one of: Agriculture & Livestock, Civil Organization, Communications, Food Systems, Foundational Skills, Logistics, Medical, Navigation, Operations, Power Systems, Preservation & Storage, Security, Shelter & Construction, Technology, Tools & Equipment, Vehicles, Water Systems, Wilderness Skills — return ONLY this exact string, no variations, no new domains, no underscores, no synonyms
  CRITICAL: Medical content (first aid, anatomy, pharmacology, herbs, veterinary, austere medicine) → Medical
  CRITICAL: Food growing, farming, animal husbandry, livestock → Agriculture & Livestock
  CRITICAL: Foraging, hunting, fishing, bushcraft, wilderness survival → Wilderness Skills
  CRITICAL: Food preservation, storage, canning, dehydration, processing → Preservation & Storage
  CRITICAL: Solar, wind, hydro, batteries, generators → Power Systems
  CRITICAL: Water sourcing, filtration, sanitation, purification → Water Systems
  CRITICAL: Building, carpentry, structural construction, shelter → Shelter & Construction
  CRITICAL: Tactical operations, mission execution, combat maneuvers, search & rescue → Operations
  CRITICAL: Governance, civil administration, community leadership → Civil Organization
  CRITICAL: Electronics, IT, computing, engineering → Technology
  CRITICAL: Hand tools, power tools, equipment maintenance → Tools & Equipment
  CRITICAL: Motor vehicles, aircraft, watercraft, vehicle maintenance → Vehicles
  CRITICAL: Radio, signals, networking, comms equipment → Communications
  CRITICAL: Supply chain, transport, distribution, inventory → Logistics
  CRITICAL: Physical security, OPSEC, threat assessment → Security
  CRITICAL: Map reading, orienteering, GPS, celestial navigation → Navigation
  CRITICAL: Cooking methods, food production, recipes, nutrition → Food Systems
- subdomain: Array of specific subcategories (up to 10)
- keywords: Array of 3-30 searchable terms
- knowledge_type: foundational | procedural | operational
  foundational — concepts, definitions, theory, background knowledge, explanations of how things work
  procedural — step-by-step techniques, instructions, how-to skills, methods you execute
  operational — application under real conditions, decision-making, mission execution, judgment calls in context
  Valid values are ONLY: foundational, procedural, operational — do not use any other values
- complexity: basic | intermediate | advanced
  basic — requires little or no prior knowledge, introductory material, simple concepts
  intermediate — requires some domain familiarity, assumes foundational knowledge is in place
  advanced — requires significant experience or expertise, high-stakes or highly technical material
  Valid values are ONLY: basic, intermediate, advanced — do not use any other values
- key_facts: Array of specific extractable claims, measurements, data points

Optional (include when present):
- scenario_applicable: Array from: tuesday_prepper, month_prepper, year_prepper, multi_year, eotwawki
- cross_domain_tags: Array from: sustainment, medical, security, communications, leadership, logistics, navigation, power_systems, water_systems, food_systems, tactical_ops, community_coordination
- chapter: Chapter name if identifiable
- page_ref: Page reference
- notes: Any additional context

EXAMPLES (knowledge_type + complexity):
- "Needle chest decompression procedure" → knowledge_type: "procedural", complexity: "advanced"
- "What is soil texture and why does it matter" → knowledge_type: "foundational", complexity: "basic"
- "Coordinating a fire team withdrawal under contact" → knowledge_type: "operational", complexity: "advanced"

Return JSON array. If no extractable concepts, return [].

Document text:
"""


class KeyRotator:
    """Round-robin dispenser of API keys, safe to share between worker threads."""

    def __init__(self, keys):
        self.keys = keys
        self.index = 0
        # run_enrichment shares one rotator across a thread pool, so the
        # read-modify-write of ``index`` has to be serialised.
        self._lock = threading.Lock()

    def next(self):
        """Return the next key in rotation.

        Raises:
            ValueError: If no keys were configured.
        """
        if not self.keys:
            raise ValueError("No Gemini API keys configured")
        with self._lock:
            key = self.keys[self.index % len(self.keys)]
            self.index += 1
        return key


def enrich_window(text, key, config):
    """Send one text window to Gemini and return the parsed concepts.

    Args:
        text: Combined page text for the window.
        key: API key to configure the client with.
        config: Config mapping; reads ``gemini.model`` and
            ``gemini.response_mime_type``.

    Returns:
        The parsed JSON value. When it is a list, non-dict items are dropped.

    Raises:
        json.JSONDecodeError: If the response cannot be parsed even after
            ``repair_json``.
    """
    # NOTE(review): genai.configure appears to set process-global client
    # state. With several worker threads the key actually used for a request
    # may be whichever thread configured last — confirm against the SDK and
    # consider per-request credentials if strict key rotation matters.
    genai.configure(api_key=key)
    model = genai.GenerativeModel(
        config['gemini']['model'],
        generation_config={"response_mime_type": config['gemini']['response_mime_type']}
    )
    response = model.generate_content(ENRICH_PROMPT + text)
    raw = response.text

    try:
        result = json.loads(raw, strict=False)
    except json.JSONDecodeError:
        repaired = repair_json(raw)
        result = json.loads(repaired, strict=False)

    # Filter out non-dict items (nested lists from truncated responses)
    if isinstance(result, list):
        result = [c for c in result if isinstance(c, dict)]
    return result


# Status codes must not be part of a longer digit run ("column 1503"), and
# "rate" must not be the tail of another word ("generateContent").
_TRANSIENT_ERROR_RE = re.compile(
    r'(?<!\d)(?:429|500|503)(?!\d)'
    r'|(?<![a-z])rate'
    r'|resource_exhausted|quota|unavailable|timeout|timed out'
    r'|connection|reset by peer|broken pipe'
)


def _is_transient(error_str):
    """Classify whether an error is transient (worth retrying) or permanent."""
    return bool(_TRANSIENT_ERROR_RE.search(error_str.lower()))


def _retry_with_backoff(fn, max_retries=5, base_delay=5.0, max_delay=120.0):
    """Retry with exponential backoff + jitter for transient errors.

    With the defaults the waits between the 5 attempts are roughly
    5s, 10s, 20s, 40s (about 75s plus jitter before giving up).
    Permanent errors (JSON parse, auth) raise immediately without retrying.

    Args:
        fn: Zero-argument callable to invoke.
        max_retries: Total number of attempts; values below 1 are treated as 1.
        base_delay: Base delay in seconds; also the upper bound of the jitter.
        max_delay: Cap on any single delay, in seconds.

    Returns:
        Whatever ``fn`` returns on the first successful attempt.

    Raises:
        Exception: The permanent error, or the last transient error once
            attempts are exhausted.
    """
    attempts = max(1, int(max_retries))
    for attempt in range(attempts):
        try:
            return fn()
        except json.JSONDecodeError:
            # Parse failures are permanent by contract; classify by type
            # because their messages contain arbitrary line/column numbers.
            raise
        except Exception as e:
            err = str(e)
            if not _is_transient(err):
                raise  # permanent — don't waste retries
            if attempt >= attempts - 1:
                logger.warning(f"  Transient error, max retries exhausted: {err[:150]}")
                raise
            delay = min(base_delay * (2 ** attempt) + random.uniform(0, base_delay), max_delay)
            logger.info(f"  Transient error (attempt {attempt+1}/{attempts}), "
                        f"retrying in {delay:.0f}s: {err[:120]}")
            time.sleep(delay)


def _match_allowlist(value, allowlist):
    """Return the allowlist entry equal to ``value`` ignoring case and
    surrounding whitespace, or None when there is no such entry."""
    if not isinstance(value, str):
        return None
    needle = value.strip().lower()
    for valid in allowlist:
        if valid.lower() == needle:
            return valid
    return None


def _reclassify_field(field_name, allowlist, concept, key, config, max_retries=3):
    """Retry Gemini up to max_retries to get a valid value for a specific field.

    Args:
        field_name: Name of the field being corrected (used in the prompt
            and in log messages).
        allowlist: Set of acceptable string values.
        concept: Concept dict; its title and the first 400 characters of its
            content (or summary) are sent as context.
        key: API key to configure the client with.
        config: Config mapping; reads ``gemini.model``.
        max_retries: Maximum number of model calls.

    Returns:
        A member of ``allowlist``, or None if no valid value was obtained.
    """
    content = concept.get('content', concept.get('summary', ''))
    if not isinstance(content, str):
        content = str(content)
    content = content[:400]
    title = concept.get('title', '(untitled)')
    allowlist_str = ', '.join(sorted(allowlist))

    # The prompt does not depend on the attempt number, so build it once.
    prompt = (
        f"Your previous response for '{field_name}' was invalid. "
        f"You must return ONLY one of these exact strings: {allowlist_str}\n\n"
        f"Title: {title}\n"
        f"Content: {content}\n\n"
        f"Return ONLY the exact string, nothing else. No explanation, no punctuation, no quotes."
    )

    for attempt in range(max_retries):
        try:
            genai.configure(api_key=key)
            model = genai.GenerativeModel(
                config['gemini']['model'],
                generation_config={"response_mime_type": "text/plain"}
            )
            resp = model.generate_content(prompt)
            value = resp.text.strip().strip('"').strip("'").strip()
            if value in allowlist:
                return value
            # Try case-insensitive match for knowledge_type/complexity
            matched = _match_allowlist(value, allowlist)
            if matched is not None:
                return matched
        except Exception as e:
            if _is_transient(str(e)):
                # No point sleeping when there is no further attempt.
                if attempt < max_retries - 1:
                    time.sleep(min(3 * (2 ** attempt) + random.uniform(0, 2), 30))
            else:
                logger.warning(f"  Reclassify retry {attempt+1} for {field_name} failed: {e}")
    return None


def _reclassify_or_fallback(field_name, shown, allowlist, fallback, concept, key, config):
    """Ask Gemini for a valid value; log and return ``fallback`` if that fails.

    ``shown`` is the already-formatted invalid value used in the log message.
    """
    new_val = _reclassify_field(field_name, allowlist, concept, key, config)
    if new_val:
        return new_val
    logger.warning(f"Invalid {field_name} {shown} for '{concept.get('title', '?')}', using fallback")
    return fallback


def validate_and_fix_concepts(concepts, key, config):
    """Validate domain, knowledge_type, complexity on each concept.

    For invalid values: retry Gemini up to 3 times, then apply safe fallback.
    Values that differ from an allowed value only by case or surrounding
    whitespace are normalised locally without an API call. A missing or
    non-string, non-list domain gets the fallback directly.

    Args:
        concepts: List of concept dicts; modified in place. Non-dict items
            are skipped.
        key: API key used for any reclassification calls.
        config: Config mapping passed through to the reclassifier.

    Returns:
        The same ``concepts`` list.
    """
    for concept in concepts:
        if not isinstance(concept, dict):
            continue

        # ── Validate domain ─────────────────────────────────────────────
        domain = concept.get('domain')
        if isinstance(domain, list):
            # Legacy array format — find first valid or reclassify.
            # _match_allowlist ignores non-string (possibly unhashable) items.
            candidates = (_match_allowlist(d, VALID_DOMAINS) for d in domain)
            first_valid = next((c for c in candidates if c is not None), None)
            if first_valid is not None:
                concept['domain'] = first_valid
            else:
                concept['domain'] = _reclassify_or_fallback(
                    'domain', f"{domain}", VALID_DOMAINS, DOMAIN_FALLBACK,
                    concept, key, config)
        elif isinstance(domain, str):
            matched = _match_allowlist(domain, VALID_DOMAINS)
            if matched is not None:
                concept['domain'] = matched
            else:
                concept['domain'] = _reclassify_or_fallback(
                    'domain', f"'{domain}'", VALID_DOMAINS, DOMAIN_FALLBACK,
                    concept, key, config)
        else:
            concept['domain'] = DOMAIN_FALLBACK

        # ── Validate knowledge_type and complexity ──────────────────────
        for field_name, allowlist, fallback in (
            ('knowledge_type', VALID_KNOWLEDGE_TYPES, KNOWLEDGE_TYPE_FALLBACK),
            ('complexity', VALID_COMPLEXITIES, COMPLEXITY_FALLBACK),
        ):
            raw = concept.get(field_name, '')
            normalized = raw.lower().strip() if isinstance(raw, str) else ''
            if normalized in allowlist:
                concept[field_name] = normalized
            else:
                concept[field_name] = _reclassify_or_fallback(
                    field_name, f"'{normalized}'", allowlist, fallback,
                    concept, key, config)

    return concepts


def _write_json_atomic(path, payload, **dump_kwargs):
    """Write ``payload`` as JSON to ``path`` via a temp file and rename.

    A crash mid-write therefore never leaves a truncated file at ``path``
    that a later run would mistake for a completed window.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, **dump_kwargs)
    os.replace(tmp_path, path)


def _load_existing_window(window_file):
    """Return the concept list stored in ``window_file``.

    Returns None when the file is missing, unparseable, or does not hold a
    list, so the caller re-enriches that window instead of failing the doc.
    """
    if not os.path.exists(window_file):
        return None
    try:
        with open(window_file, encoding='utf-8') as f:
            existing = json.load(f)
    except ValueError as e:  # JSONDecodeError / UnicodeDecodeError
        logger.warning(f"  Corrupt window file {window_file}, re-enriching: {e}")
        return None
    if not isinstance(existing, list):
        logger.warning(f"  Unexpected content in {window_file}, re-enriching")
        return None
    return existing


def _build_windows(pages_text, window_size):
    """Group page texts into ``(start_index, combined_text)`` windows.

    Each page is prefixed with a ``--- Page N ---`` header using its
    1-based position in ``pages_text``.
    """
    windows = []
    for start in range(0, len(pages_text), window_size):
        chunk = pages_text[start:start + window_size]
        combined = "\n\n".join(
            f"--- Page {start + offset + 1} ---\n{page}"
            for offset, page in enumerate(chunk)
        )
        windows.append((start, combined))
    return windows


def enrich_single(file_hash, db, config, key_rotator):
    """Enrich one document: page text → per-window concept JSON files.

    Window files that already exist are reused, so an interrupted run
    resumes where it stopped. A failed window is skipped rather than
    failing the document.

    Args:
        file_hash: Document hash; also the directory name under
            ``paths.text`` and ``paths.concepts``.
        db: Status DB handle providing ``get_document``, ``update_status``
            and ``mark_failed``.
        config: Config mapping (``paths``, ``processing``, ``gemini``).
        key_rotator: Source of API keys (``next()``).

    Returns:
        True if the document was marked enriched, False otherwise.
    """
    doc = db.get_document(file_hash)
    if not doc:
        return False

    text_dir = os.path.join(config['paths']['text'], file_hash)
    concepts_dir = os.path.join(config['paths']['concepts'], file_hash)
    window_size = config['processing']['enrich_window_size']
    delay = config['processing']['rate_limit_delay']
    proc = config.get('processing', {})
    max_retries = proc.get('enrich_max_retries', proc.get('max_retries', 5))
    base_delay = proc.get('enrich_base_delay', 5.0)
    max_delay = proc.get('enrich_max_delay', 120.0)

    if not os.path.exists(text_dir):
        db.mark_failed(file_hash, f"Text directory not found: {text_dir}")
        return False

    db.update_status(file_hash, 'enriching')

    try:
        os.makedirs(concepts_dir, exist_ok=True)

        # NOTE(review): plain lexicographic sort — presumably page files are
        # zero-padded; confirm against the extractor's naming.
        page_files = sorted(
            f for f in os.listdir(text_dir)
            if f.startswith('page_') and f.endswith('.txt')
        )
        if not page_files:
            db.mark_failed(file_hash, "No page files found")
            return False

        pages_text = []
        for pf in page_files:
            with open(os.path.join(text_dir, pf), encoding='utf-8') as f:
                pages_text.append(f.read())

        windows = _build_windows(pages_text, window_size)

        total_concepts = 0
        failed_windows = []

        for w_idx, (start_page, window_text) in enumerate(windows):
            window_file = os.path.join(concepts_dir, f"window_{w_idx+1:04d}.json")

            existing = _load_existing_window(window_file)
            if existing is not None:
                total_concepts += len(existing)
                logger.debug(f"  Window {w_idx+1} already exists, skipping")
                continue

            # Too little text to be worth an API call — record an empty result
            if len(window_text.strip()) < 50:
                _write_json_atomic(window_file, [])
                continue

            # Attempt enrichment with backoff — failures skip the window, not the doc
            try:
                key = key_rotator.next()
                concepts = _retry_with_backoff(
                    lambda k=key, t=window_text: enrich_window(t, k, config),
                    max_retries=max_retries,
                    base_delay=base_delay,
                    max_delay=max_delay,
                )
            except Exception as e:
                failed_windows.append((w_idx + 1, str(e)[:100]))
                logger.warning(f"  Window {w_idx+1}/{len(windows)} failed: {e}")
                continue  # skip this window, keep going

            if not isinstance(concepts, list):
                concepts = [concepts] if isinstance(concepts, dict) else []
            concepts = [c for c in concepts if isinstance(c, dict)]

            # Validate domain, knowledge_type, complexity — retry then fallback
            validation_key = key_rotator.next()
            concepts = validate_and_fix_concepts(concepts, validation_key, config)

            for concept in concepts:
                concept['_window'] = w_idx + 1
                concept['_start_page'] = start_page + 1
                concept['_doc_hash'] = file_hash

            # JSON FIRST: save before anything else
            _write_json_atomic(window_file, concepts, indent=2, ensure_ascii=False)

            total_concepts += len(concepts)
            logger.debug(f"  Window {w_idx+1}/{len(windows)}: {len(concepts)} concepts")
            time.sleep(delay)

        # Decide document status based on results
        meta = {
            'hash': file_hash,
            'total_windows': len(windows),
            'total_concepts': total_concepts,
            'failed_windows': len(failed_windows),
            'window_size': window_size,
            'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
        }
        _write_json_atomic(os.path.join(concepts_dir, 'meta.json'), meta, indent=2)

        if total_concepts > 0 or not failed_windows:
            # Some concepts extracted, or all windows were empty — mark enriched
            error_msg = None
            # page_count may be present but None, so don't rely on .get's default
            page_count = doc.get('page_count') or 0
            if total_concepts == 0 and page_count >= 3:
                error_msg = (f"0 concepts from {page_count} pages — "
                             f"likely image-only PDF, may need manual review")
                logger.warning(f"  {doc['filename']}: {error_msg}")
            elif failed_windows:
                wins = ', '.join(str(w) for w, _ in failed_windows[:10])
                error_msg = (f"Partial: {len(failed_windows)}/{len(windows)} "
                             f"windows failed (windows {wins})")
                logger.warning(f"  {doc['filename']}: {error_msg}")

            db.update_status(file_hash, 'enriched',
                             concepts_extracted=total_concepts,
                             error_message=error_msg)
            fw_note = f", {len(failed_windows)} windows failed" if failed_windows else ""
            logger.info(f"Enriched {doc['filename']}: {total_concepts} concepts "
                        f"from {len(windows)} windows{fw_note}")
            return True
        else:
            # Every window failed — document truly failed
            first_err = failed_windows[0][1] if failed_windows else 'unknown'
            db.mark_failed(file_hash, f"All {len(windows)} windows failed: {first_err}")
            logger.error(f"  {doc['filename']}: all {len(windows)} windows failed")
            return False

    except Exception as e:
        logger.error(f"Enrichment failed for {file_hash}: {e}\n{traceback.format_exc()}")
        db.mark_failed(file_hash, str(e))
        return False


def _enriching_is_stale(doc, now, max_hours):
    """Return True if ``doc`` should be reset from 'enriching' to 'extracted'.

    A document counts as stale when it cannot be loaded, has no usable
    ``extracted_at`` value, or that timestamp is more than ``max_hours``
    before ``now``. Naive timestamps are interpreted as UTC.
    """
    extracted_at = doc.get('extracted_at', '') if doc else ''
    if not extracted_at:
        return True
    stamp = str(extracted_at)
    # datetime.fromisoformat only accepts a trailing 'Z' from Python 3.11 on
    if stamp.endswith('Z'):
        stamp = stamp[:-1] + '+00:00'
    try:
        ts = datetime.fromisoformat(stamp)
    except ValueError:
        return True
    if ts.tzinfo is None:
        ts = ts.replace(tzinfo=timezone.utc)
    return (now - ts).total_seconds() / 3600 > max_hours


def _recover_stale_enriching(db, max_hours=STALE_ENRICHING_HOURS):
    """Reset docs stuck in enriching back to extracted so they get retried.

    This handles the case where a previous enrichment run crashed mid-document.
    The enricher skips already-completed window files, so no work is lost.

    Staleness is judged from the document's ``extracted_at`` timestamp,
    which is used as a proxy for when enrichment started.

    Args:
        db: Status DB handle; uses ``_get_conn()`` and ``get_document``.
        max_hours: Age threshold in hours.
    """
    conn = db._get_conn()
    rows = conn.execute(
        "SELECT hash, filename FROM documents WHERE status = 'enriching'"
    ).fetchall()
    if not rows:
        return

    # Check extracted_at timestamp — if enriching started > max_hours ago, reset
    now = datetime.now(timezone.utc)
    reset = [
        row for row in rows
        if _enriching_is_stale(db.get_document(row['hash']), now, max_hours)
    ]

    for row in reset:
        conn.execute(
            "UPDATE documents SET status = 'extracted' WHERE hash = ?",
            (row['hash'],)
        )
        logger.warning(f"Recovered stale enriching doc: {row['filename']} ({row['hash'][:12]}...)")

    if reset:
        conn.commit()
        logger.info(f"Reset {len(reset)} stale enriching docs back to extracted")


def run_enrichment(workers=None, limit=None):
    """Enrich every document currently in status 'extracted'.

    Args:
        workers: Thread count; defaults to ``processing.enrich_workers``.
        limit: Optional cap passed to ``get_by_status``.

    Returns:
        Number of documents successfully enriched (0 if there are no keys
        or no documents).
    """
    config = get_config()
    db = StatusDB()
    workers = workers or config['processing']['enrich_workers']

    # Recover docs orphaned by previous crashed enrichment runs
    _recover_stale_enriching(db)

    keys = config.get('gemini_keys', [])
    if not keys:
        logger.error("No Gemini API keys configured in .env")
        return 0

    key_rotator = KeyRotator(keys)
    extracted = db.get_by_status('extracted', limit=limit)
    if not extracted:
        logger.info("No extracted documents to enrich")
        return 0

    logger.info(f"Enriching {len(extracted)} documents with {workers} workers, {len(keys)} API key(s)")

    success = 0
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # Each task gets its own StatusDB instance; the key rotator is shared.
        futures = {
            pool.submit(enrich_single, doc['hash'], StatusDB(), config, key_rotator): doc
            for doc in extracted
        }
        for future in as_completed(futures):
            doc = futures[future]
            try:
                if future.result():
                    success += 1
            except Exception as e:
                logger.error(f"Worker error for {doc['hash']}: {e}")

    logger.info(f"Enrichment complete: {success}/{len(extracted)} succeeded")
    return success