""" RECON Stream B — New Library Pipeline Handles file acquisition, standardized naming (from enriched book_title), collision resolution, and library placement for new and existing documents. Two-phase ingest for new files: Phase A (ingest_acquire): _acquired/ -> _ingest/ (preserve original name, queue for processing) Phase B (ingest_place): _ingest/ -> library (after enrichment populates book_title) Migration for existing files: migrate_civil_org(): Rename/move enriched Civil Org docs to standardized filenames All moves are logged to file_operations table for audit and rollback. """ import json import logging import os import re import shutil import signal import time from .utils import ( content_hash, generate_download_url, sanitize_filename, get_config, ) from .organizer import ( DOMAIN_FOLDERS, normalize_folder_name, determine_dominant_domain, ) from .status import StatusDB logger = logging.getLogger('recon.pipeline') # ── Filename Helpers ───────────────────────────────────────────────────── def pipeline_sanitize_filename(title_or_filename, doc_hash=None): """Sanitize a title/filename and convert spaces to underscores. Primary input should be book_title + ".pdf" from documents table. Falls back to raw filename if title is empty. Returns sanitized filename with underscores instead of spaces. """ if not title_or_filename or not title_or_filename.strip(): return None # Ensure .pdf extension if not title_or_filename.lower().endswith('.pdf'): title_or_filename = title_or_filename + '.pdf' # Use existing sanitizer for base cleanup sanitized = sanitize_filename(title_or_filename, doc_hash=doc_hash) # Convert spaces to underscores stem, ext = os.path.splitext(sanitized) stem = stem.replace(' ', '_') # Collapse multiple underscores stem = re.sub(r'_{2,}', '_', stem) # Strip leading/trailing underscores stem = stem.strip('_') if not stem: return None return stem + ext def extract_author_last_name(book_author): """Extract the last name from a book_author string. "John Smith" -> "Smith" "Smith, John" -> "Smith" Returns None for empty, organizational, or unparse-able authors. """ if not book_author: return None author = book_author.strip() # Skip non-personal authors skip_patterns = [ r'^\(YT\)', # YouTube-sourced r'^Wikipedia', r'^null$', r'^unknown$', r'^N/A$', r'^Various', r'^Staff', r'^Anonymous', r'Department\b', r'Committee\b', r'Organization\b', r'Institute\b', r'Association\b', r'Foundation\b', r'University\b', r'Government\b', r'Bureau\b', r'Agency\b', r'Office\b', r'Division\b', r'Center\b', r'Centre\b', r'Ministry\b', r'Commission\b', r'Council\b', r'Board\b', r'Corps\b', r'Army\b', r'Navy\b', r'Air Force\b', r'Marine\b', r'U\.?S\.?\b', r'United States\b', ] for pattern in skip_patterns: if re.search(pattern, author, re.IGNORECASE): return None # "Last, First" format if ',' in author: parts = author.split(',') last = parts[0].strip() if last and len(last) > 1: return last # "First Last" or "First Middle Last" format parts = author.split() if len(parts) >= 2: last = parts[-1].strip() # Skip if last part looks like a suffix if last.lower() in ('jr', 'jr.', 'sr', 'sr.', 'ii', 'iii', 'iv', 'phd', 'md', 'do'): if len(parts) >= 3: last = parts[-2].strip() else: return None if last and len(last) > 1 and last[0].isupper(): return last return None def extract_year(filename, book_title=None): """Extract a publication year from filename or title. Looks for 4-digit years (1800-2099). Returns the last match (more likely to be publication year vs chapter/section numbers). """ years = [] for text in [filename, book_title]: if not text: continue matches = re.findall(r'\b(1[89]\d{2}|20\d{2})\b', text) years.extend(matches) if years: return years[-1] return None # ── Collision Resolution ───────────────────────────────────────────────── def resolve_collision(sanitized_name, target_dir, doc_hash, author, title, orig_filename, db): """Resolve filename collisions using a 4-step ladder. Step 1: Title.pdf (base name) Step 2: Title_AuthorLast.pdf (if author available) Step 3: Title_AuthorLast_Year.pdf (if both author AND year available) Step 4: -> _duplicates/ + duplicate_review table If existing file has same content hash, it's the same file (not a collision). Returns: (final_filename, collision_step, is_duplicate) """ target_path = os.path.join(target_dir, sanitized_name) # Step 1: Base name if not os.path.exists(target_path): return (sanitized_name, 1, False) # Check if same file (same content hash) existing_hash = content_hash(target_path) if existing_hash == doc_hash: return (sanitized_name, 1, False) # Same file, overwrite is fine stem, ext = os.path.splitext(sanitized_name) author_last = extract_author_last_name(author) # Step 2: Add author last name if author_last: name_with_author = f"{stem}_{author_last}{ext}" target_path_2 = os.path.join(target_dir, name_with_author) if not os.path.exists(target_path_2): return (name_with_author, 2, False) existing_hash_2 = content_hash(target_path_2) if existing_hash_2 == doc_hash: return (name_with_author, 2, False) # Step 3: Add year year = extract_year(orig_filename, title) if year: name_with_year = f"{stem}_{author_last}_{year}{ext}" target_path_3 = os.path.join(target_dir, name_with_year) if not os.path.exists(target_path_3): return (name_with_year, 3, False) existing_hash_3 = content_hash(target_path_3) if existing_hash_3 == doc_hash: return (name_with_year, 3, False) # Step 4: Duplicate review return (sanitized_name, 4, True) # ── File Operations ────────────────────────────────────────────────────── def move_file(source, target, doc_hash, db, orig_filename, collision_step=1, notes=None): """Move a file and log the operation. Uses os.rename() for same-volume moves (atomic on NFS), falls back to shutil.move() on cross-device errors. Returns True on success, False on failure. """ source_filename = os.path.basename(source) target_filename = os.path.basename(target) try: target_dir = os.path.dirname(target) os.makedirs(target_dir, exist_ok=True) try: os.rename(source, target) except OSError: # Cross-device — fall back to shutil.move shutil.move(source, target) # Verify if not os.path.exists(target): logger.error("Move failed — target not found after move: %s", target) return False # Log the operation db.log_file_operation( doc_hash=doc_hash, operation='move', source_path=source, target_path=target, source_filename=source_filename, target_filename=target_filename, original_filename=orig_filename, collision_step=collision_step, notes=notes, ) return True except Exception as e: logger.error("Failed to move %s -> %s: %s", source, target, e) # Attempt rollback if target was partially written if os.path.exists(target) and not os.path.exists(source): try: os.rename(target, source) logger.info("Rolled back partial move: %s -> %s", target, source) except Exception: pass return False def update_qdrant_payload(doc_hash, new_path, new_filename, original_filename, config): """Update Qdrant payloads for all points matching doc_hash. Sets download_url, filename, and original_filename. Returns count of points updated, 0 if Qdrant unreachable. """ try: from qdrant_client import QdrantClient from qdrant_client.models import FieldCondition, MatchValue, Filter except ImportError: logger.warning("qdrant_client not installed — skipping Qdrant update") return 0 library_root = config.get('library_root', '/mnt/library') new_url = generate_download_url(new_path, library_root) try: qdrant = QdrantClient( host=config['vector_db']['host'], port=config['vector_db']['port'], timeout=60, ) collection = config['vector_db']['collection'] hits = qdrant.scroll( collection_name=collection, scroll_filter=Filter(must=[ FieldCondition(key="doc_hash", match=MatchValue(value=doc_hash)) ]), limit=2000, with_payload=False, ) point_ids = [p.id for p in hits[0]] if not point_ids: return 0 payload = { "download_url": new_url, "filename": new_filename, "original_filename": original_filename, } qdrant.set_payload( collection_name=collection, payload=payload, points=point_ids, ) logger.debug("Updated %d Qdrant points for %s", len(point_ids), doc_hash[:8]) return len(point_ids) except Exception as e: logger.warning("Qdrant update failed for %s: %s", doc_hash[:8], e) return 0 def reverse_operation(operation_id, db, config): """Reverse a file operation by moving the file back. Returns True on success, False on failure. """ op = db.get_file_operation(operation_id) if not op: logger.error("Operation %d not found", operation_id) return False if op['reversed_at']: logger.warning("Operation %d already reversed", operation_id) return False source = op['target_path'] # current location target = op['source_path'] # original location if not os.path.exists(source): logger.error("Cannot reverse — file not at target: %s", source) return False try: target_dir = os.path.dirname(target) os.makedirs(target_dir, exist_ok=True) try: os.rename(source, target) except OSError: shutil.move(source, target) if not os.path.exists(target): logger.error("Reverse failed — file not at original location: %s", target) return False # Update DB db.mark_operation_reversed(operation_id) # Update catalogue and documents back to original path doc_hash = op['doc_hash'] db.update_catalogue_path(doc_hash, target, op['source_filename']) db.sync_document_path(doc_hash, target, op['source_filename']) # Clear organized_at so Phase B can re-trigger placement conn = db._get_conn() conn.execute('UPDATE documents SET organized_at = NULL WHERE hash = ?', (doc_hash,)) conn.commit() # Update Qdrant update_qdrant_payload( doc_hash, target, op['source_filename'], op.get('original_filename'), config, ) logger.info("Reversed operation %d: %s -> %s", operation_id, source, target) return True except Exception as e: logger.error("Failed to reverse operation %d: %s", operation_id, e) return False # ── Two-Phase Ingest ───────────────────────────────────────────────────── def ingest_acquire(filepath, db, config): """Phase A: Move a stable PDF from _acquired/ to _ingest/ and queue for processing. Returns dict with: hash, action, source, target, error """ result = {'hash': None, 'action': 'skip', 'source': filepath, 'target': None, 'error': None} if not os.path.exists(filepath): result['error'] = 'File not found' return result try: file_hash = content_hash(filepath) result['hash'] = file_hash except Exception as e: result['error'] = f'Hash failed: {e}' return result # Check if hash already in catalogue (true duplicate) doc = db.get_document(file_hash) if doc and doc.get('status') == 'complete': # Already processed — move to _duplicates/ dup_dir = config.get('new_pipeline', {}).get('duplicates_dir', '/mnt/library/_ingest/_duplicates') dup_target = os.path.join(dup_dir, os.path.basename(filepath)) try: os.makedirs(dup_dir, exist_ok=True) shutil.move(filepath, dup_target) except Exception: pass result['action'] = 'duplicate' result['target'] = dup_target return result # Move to _ingest/ preserving original filename ingest_dir = config.get('new_pipeline', {}).get('ingest_dir', '/mnt/library/_ingest') filename = os.path.basename(filepath) target = os.path.join(ingest_dir, filename) # Handle name collision in _ingest/ if os.path.exists(target): stem, ext = os.path.splitext(filename) target = os.path.join(ingest_dir, f"{stem}_{file_hash[:6]}{ext}") try: os.makedirs(ingest_dir, exist_ok=True) shutil.move(filepath, target) except Exception as e: result['error'] = f'Move to ingest failed: {e}' return result result['target'] = target # Add to catalogue + queue for pipeline size = os.path.getsize(target) db.add_to_catalogue(file_hash, filename, target, size, '_ingest', 'Pipeline') db.queue_document(file_hash) result['action'] = 'acquired' logger.info("Acquired %s -> %s [%s]", filename, target, file_hash[:8]) return result def ingest_place(doc_hash, db, config): """Phase B: Move an enriched document from _ingest/ to its final library location. Only called for docs where status='complete' AND organized_at IS NULL. Returns dict with: hash, action, source, target, domain, subdomain, collision_step, error, qdrant_points_updated """ result = { 'hash': doc_hash, 'action': 'skip', 'source': None, 'target': None, 'domain': None, 'subdomain': None, 'collision_step': None, 'error': None, 'qdrant_points_updated': 0, } doc = db.get_document(doc_hash) if not doc: result['error'] = 'Document not found' return result current_path = doc.get('path', '') result['source'] = current_path if not current_path or not os.path.exists(current_path): result['error'] = 'File not found on disk' return result # Only process files in _ingest/ ingest_dir = config.get('new_pipeline', {}).get('ingest_dir', '/mnt/library/_ingest') if ingest_dir not in current_path: result['action'] = 'skip_not_ingest' return result # Check pilot domain restriction pilot_domain = config.get('new_pipeline', {}).get('pilot_domain') # Get enriched metadata book_title = doc.get('book_title') book_author = doc.get('book_author') orig_filename = doc.get('filename', os.path.basename(current_path)) # Determine domain data_dir = config['paths']['data'] domain, subdomain, confidence = determine_dominant_domain(doc_hash, data_dir) result['domain'] = domain result['subdomain'] = subdomain if domain is None: result['action'] = 'skip_unclassified' return result if pilot_domain and domain != pilot_domain: result['action'] = 'skip_wrong_domain' return result # Build standardized filename from book_title if book_title: san_name = pipeline_sanitize_filename(book_title + '.pdf', doc_hash) else: # Fallback to current filename san_name = pipeline_sanitize_filename(orig_filename, doc_hash) logger.warning("No book_title for %s, using filename: %s", doc_hash[:8], orig_filename) if not san_name: result['error'] = 'Sanitization produced empty filename' return result # Build target directory library_root = config['library_root'] domain_folder = DOMAIN_FOLDERS.get(domain, normalize_folder_name(domain)) sub_folder = normalize_folder_name(subdomain) if subdomain else 'General' target_dir = os.path.join(library_root, domain_folder, sub_folder) # Resolve collisions final_name, step, is_duplicate = resolve_collision( san_name, target_dir, doc_hash, book_author, book_title, orig_filename, db ) result['collision_step'] = step if is_duplicate: # Move to _duplicates/ dup_dir = config.get('new_pipeline', {}).get('duplicates_dir', '/mnt/library/_ingest/_duplicates') dup_target = os.path.join(dup_dir, os.path.basename(current_path)) try: os.makedirs(dup_dir, exist_ok=True) shutil.move(current_path, dup_target) except Exception: pass # Find the hash of the colliding file collision_path = os.path.join(target_dir, san_name) collision_hash = content_hash(collision_path) if os.path.exists(collision_path) else None db.queue_duplicate_review( doc_hash=doc_hash, original_filename=orig_filename, sanitized_filename=san_name, collision_with_hash=collision_hash, collision_path=collision_path, duplicate_path=dup_target, domain=domain, subdomain=subdomain, book_author=book_author, book_title=book_title, ) result['action'] = 'duplicate_review' result['target'] = dup_target return result # Move file to final location target_path = os.path.join(target_dir, final_name) if not move_file(current_path, target_path, doc_hash, db, orig_filename, step): result['error'] = 'Move failed' return result result['target'] = target_path result['action'] = 'placed' # Update catalogue and documents table db.update_catalogue_path(doc_hash, target_path, final_name) db.sync_document_path(doc_hash, target_path, final_name) db.mark_organized(doc_hash) # Update Qdrant payloads points = update_qdrant_payload(doc_hash, target_path, final_name, orig_filename, config) result['qdrant_points_updated'] = points # Update file_operations with Qdrant count if points > 0: conn = db._get_conn() conn.execute( "UPDATE file_operations SET qdrant_points_updated = ? " "WHERE doc_hash = ? AND reversed_at IS NULL ORDER BY performed_at DESC LIMIT 1", (points, doc_hash) ) conn.commit() logger.info("Placed %s -> %s [%s/%s, step %d, %d vectors]", doc_hash[:8], target_path, domain, subdomain, step, points) return result def ingest_scan(db, config): """Run both ingest phases in a single scan cycle. Phase A: Scan _acquired/ for stable PDFs -> ingest_acquire() Phase B: Query enriched-but-unplaced docs -> ingest_place() Returns dict with: acquired, placed, skipped, failed, duplicates """ stats = {'acquired': 0, 'placed': 0, 'skipped': 0, 'failed': 0, 'duplicates': 0} # Phase A: Scan _acquired/ acquired_dir = config.get('new_pipeline', {}).get('acquired_dir', '/mnt/library/_acquired') mtime_stability = config.get('new_pipeline', {}).get('mtime_stability', 10) if os.path.isdir(acquired_dir): now = time.time() for fname in os.listdir(acquired_dir): if not fname.lower().endswith('.pdf'): continue filepath = os.path.join(acquired_dir, fname) if not os.path.isfile(filepath): continue # Check mtime stability (file is fully written) mtime = os.path.getmtime(filepath) if now - mtime < mtime_stability: continue result = ingest_acquire(filepath, db, config) if result['action'] == 'acquired': stats['acquired'] += 1 elif result['action'] == 'duplicate': stats['duplicates'] += 1 elif result.get('error'): stats['failed'] += 1 # Phase B: Place enriched-but-unplaced documents from _ingest/ ingest_dir = config.get('new_pipeline', {}).get('ingest_dir', '/mnt/library/_ingest') unplaced = db.get_ingest_pending(ingest_dir, limit=50) for doc_row in unplaced: result = ingest_place(doc_row['hash'], db, config) if result['action'] == 'placed': stats['placed'] += 1 elif result['action'] == 'duplicate_review': stats['duplicates'] += 1 elif result.get('error'): stats['failed'] += 1 else: stats['skipped'] += 1 return stats # ── Domain Migration ───────────────────────────────────────────────────── def migrate_domain(domain_name, db, config, dry_run=False): """Migrate documents in a domain directory to standardized filenames. Walks /mnt/library//**/*.pdf, looks up enriched metadata from the documents table, and renames/moves to standardized filenames derived from book_title. Args: domain_name: Domain name as it appears in documents/Qdrant (e.g., "Logistics", "Civil Organization") db: StatusDB instance config: RECON config dict dry_run: If True, report what would happen without moving Returns summary dict. """ library_root = config['library_root'] data_dir = config['paths']['data'] # Resolve domain folder name domain_folder = DOMAIN_FOLDERS.get(domain_name) if not domain_folder: domain_folder = normalize_folder_name(domain_name) domain_dir = os.path.join(library_root, domain_folder) stats = { 'total': 0, 'moved': 0, 'renamed': 0, 'skipped': 0, 'already_correct': 0, 'failed': 0, 'duplicates': 0, 'domain_mismatch': 0, 'no_book_title': 0, 'not_catalogued': 0, 'errors': [], } if not os.path.isdir(domain_dir): logger.error("%s directory not found: %s", domain_name, domain_dir) return stats # Collect all PDFs pdf_files = [] for root, dirs, files in os.walk(domain_dir): dirs[:] = [d for d in dirs if not d.startswith('_')] for fname in files: if fname.lower().endswith('.pdf'): pdf_files.append(os.path.join(root, fname)) stats['total'] = len(pdf_files) logger.info("%s migration: found %d PDFs (dry_run=%s)", domain_name, len(pdf_files), dry_run) for filepath in sorted(pdf_files): current_filename = os.path.basename(filepath) # Hash and look up in DB try: file_hash = content_hash(filepath) except Exception as e: stats['failed'] += 1 stats['errors'].append(f"Hash error: {filepath}: {e}") continue doc = db.get_document(file_hash) if not doc: stats['not_catalogued'] += 1 logger.warning("Not catalogued: %s [%s]", filepath, file_hash[:8]) continue # Check dominant domain — only proceed if it matches the target domain domain, subdomain, confidence = determine_dominant_domain(file_hash, data_dir) if domain != domain_name: stats['domain_mismatch'] += 1 if not dry_run: logger.info("Domain mismatch for %s: %s (conf=%.2f), skipping", file_hash[:8], domain, confidence) continue # Get book_title for standardized filename book_title = doc.get('book_title') book_author = doc.get('book_author') orig_filename = current_filename if book_title: san_name = pipeline_sanitize_filename(book_title + '.pdf', file_hash) else: san_name = pipeline_sanitize_filename(current_filename, file_hash) stats['no_book_title'] += 1 if not dry_run: logger.warning("No book_title for %s, using filename", file_hash[:8]) if not san_name: stats['failed'] += 1 stats['errors'].append(f"Empty sanitized name: {filepath}") continue # Build target directory sub_folder = normalize_folder_name(subdomain) if subdomain else 'General' target_dir = os.path.join(library_root, domain_folder, sub_folder) target_path = os.path.join(target_dir, san_name) # Check if already at correct location if os.path.abspath(filepath) == os.path.abspath(target_path): stats['already_correct'] += 1 if not dry_run: update_qdrant_payload(file_hash, filepath, san_name, orig_filename, config) db.mark_organized(file_hash) continue # Resolve collisions final_name, step, is_duplicate = resolve_collision( san_name, target_dir, file_hash, book_author, book_title, orig_filename, db ) if is_duplicate: stats['duplicates'] += 1 if not dry_run: dup_dir = config.get('new_pipeline', {}).get('duplicates_dir', '/mnt/library/_ingest/_duplicates') dup_target = os.path.join(dup_dir, current_filename) try: os.makedirs(dup_dir, exist_ok=True) shutil.move(filepath, dup_target) except Exception: pass collision_path = os.path.join(target_dir, san_name) collision_hash = content_hash(collision_path) if os.path.exists(collision_path) else None db.queue_duplicate_review( doc_hash=file_hash, original_filename=orig_filename, sanitized_filename=san_name, collision_with_hash=collision_hash, collision_path=collision_path, duplicate_path=dup_target, domain=domain, subdomain=subdomain, book_author=book_author, book_title=book_title, ) else: logger.info(" [DRY RUN] DUPLICATE: %s -> _duplicates/ (step %d)", filepath, step) continue final_target = os.path.join(target_dir, final_name) if dry_run: action = 'rename' if os.path.dirname(filepath) == target_dir else 'move' logger.info(" [DRY RUN] %s: %s -> %s (step %d)", action.upper(), filepath, final_target, step) if action == 'rename': stats['renamed'] += 1 else: stats['moved'] += 1 continue # Execute move if move_file(filepath, final_target, file_hash, db, orig_filename, step, notes=f'{domain_name.lower().replace(" ", "_")}_migration'): if os.path.dirname(filepath) == target_dir: stats['renamed'] += 1 else: stats['moved'] += 1 db.update_catalogue_path(file_hash, final_target, final_name) db.sync_document_path(file_hash, final_target, final_name) db.mark_organized(file_hash) points = update_qdrant_payload( file_hash, final_target, final_name, orig_filename, config ) if points > 0: conn = db._get_conn() conn.execute( "UPDATE file_operations SET qdrant_points_updated = ? " "WHERE doc_hash = ? AND reversed_at IS NULL " "ORDER BY performed_at DESC LIMIT 1", (points, file_hash) ) conn.commit() else: stats['failed'] += 1 stats['errors'].append(f"Move failed: {filepath}") logger.info("%s migration complete: total=%d, moved=%d, renamed=%d, " "already_correct=%d, skipped=%d, failed=%d, duplicates=%d, " "domain_mismatch=%d, no_book_title=%d, not_catalogued=%d", domain_name, stats['total'], stats['moved'], stats['renamed'], stats['already_correct'], stats['skipped'], stats['failed'], stats['duplicates'], stats['domain_mismatch'], stats['no_book_title'], stats['not_catalogued']) return stats def migrate_civil_org(db, config, dry_run=False): """Migrate Civil Organization domain. Thin wrapper around migrate_domain().""" return migrate_domain('Civil Organization', db, config, dry_run) _shutdown = False def _signal_handler(sig, frame): global _shutdown _shutdown = True logger.info("Shutdown signal received, finishing current cycle...") def run_watchdog(config): """Polling watchdog loop for the new library pipeline. Runs ingest_scan() every poll_interval seconds. Checks new_pipeline.enabled each cycle — exits if disabled. Handles SIGTERM/SIGINT for graceful shutdown. """ from .utils import setup_logging as _setup_logging _setup_logging('recon.pipeline') global _shutdown _shutdown = False signal.signal(signal.SIGTERM, _signal_handler) signal.signal(signal.SIGINT, _signal_handler) poll_interval = config.get('new_pipeline', {}).get('poll_interval', 60) db = StatusDB() logger.info("Pipeline watchdog started (poll=%ds)", poll_interval) while not _shutdown: # Re-read config each cycle to pick up enable/disable changes try: current_config = get_config() except Exception: current_config = config if not current_config.get('new_pipeline', {}).get('enabled', False): logger.debug("Pipeline disabled, sleeping...") time.sleep(poll_interval) continue try: stats = ingest_scan(db, current_config) if stats['acquired'] or stats['placed'] or stats['failed'] or stats['duplicates']: logger.info("Watchdog cycle: acquired=%d placed=%d failed=%d dupes=%d", stats['acquired'], stats['placed'], stats['failed'], stats['duplicates']) except Exception as e: logger.error("Watchdog cycle error: %s", e) time.sleep(poll_interval) logger.info("Pipeline watchdog stopped") # ── Library Sweep ─────────────────────────────────────────────────────── # Garbage title exact matches (case-insensitive) GARBAGE_TITLES_EXACT = { 'download', 'cmspage', 'plant uses', 'natural resources conservation service', 'national center for home food preservation', 'home', 'index', 'about', 'contact', 'menu', 'page', 'search', 'untitled', } # Garbage title pattern fragments (URL/filename indicators only) GARBAGE_TITLE_FRAGMENTS = ('.html', '.php', '.asp', '://', '\\\\') def is_garbage_title(book_title): """Check if a book_title is known garbage metadata. Returns True if the title matches exact garbage strings, is too short, or contains URL/filename fragments. Bare '/' is allowed (common in titles like "Chemical/Biological" or "Vauxhall/Opel"). """ if not book_title: return True t = book_title.strip() if len(t) < 4: return True if t.lower() in GARBAGE_TITLES_EXACT: return True for frag in GARBAGE_TITLE_FRAGMENTS: if frag in t: return True return False def _sweep_classify_file(filepath, library_root, db, config): """Classify a single file for the sweep. Returns a plan entry dict or None to skip.""" data_dir = config['paths']['data'] try: file_hash = content_hash(filepath) except Exception as e: return {'action': 'error', 'path': filepath, 'error': 'hash_failed: {}'.format(e)} # Look up in documents table doc = db.get_document(file_hash) if not doc: # Not catalogued — rescue to _acquired/ return { 'action': 'rescue', 'path': filepath, 'hash': file_hash, 'target_dir': os.path.join(library_root, '_acquired'), 'filename': os.path.basename(filepath), } status = doc.get('status', '') if status in ('queued', 'extracting', 'enriching', 'embedding'): return {'action': 'skip_in_progress', 'path': filepath, 'hash': file_hash} if status == 'failed': return {'action': 'skip_failed', 'path': filepath, 'hash': file_hash} if status != 'complete': return {'action': 'skip_other_status', 'path': filepath, 'hash': file_hash, 'status': status} # Complete document — classify book_title = doc.get('book_title') book_author = doc.get('book_author') orig_filename = doc.get('filename', os.path.basename(filepath)) # Garbage title filter if is_garbage_title(book_title): return { 'action': 'skip_garbage', 'path': filepath, 'hash': file_hash, 'book_title': book_title, } # Determine domain domain, subdomain, confidence = determine_dominant_domain(file_hash, data_dir) if domain is None: # Unclassifiable — move to _unclassified/ target_dir = os.path.join(library_root, '_unclassified') if book_title: san_name = pipeline_sanitize_filename(book_title + '.pdf', file_hash) else: san_name = pipeline_sanitize_filename(orig_filename, file_hash) if not san_name: san_name = pipeline_sanitize_filename(orig_filename, file_hash) or os.path.basename(filepath) return { 'action': 'unclassified', 'path': filepath, 'hash': file_hash, 'target_dir': target_dir, 'filename': san_name, 'book_title': book_title, 'book_author': book_author, 'orig_filename': orig_filename, 'domain': None, 'subdomain': None, 'confidence': confidence, } # Build target path domain_folder = DOMAIN_FOLDERS.get(domain, normalize_folder_name(domain)) sub_folder = normalize_folder_name(subdomain) if subdomain else 'General' target_dir = os.path.join(library_root, domain_folder, sub_folder) # Standardized filename from book_title if book_title: san_name = pipeline_sanitize_filename(book_title + '.pdf', file_hash) else: san_name = pipeline_sanitize_filename(orig_filename, file_hash) if not san_name: san_name = pipeline_sanitize_filename(orig_filename, file_hash) or os.path.basename(filepath) target_path = os.path.join(target_dir, san_name) # Already at correct location? if os.path.abspath(filepath) == os.path.abspath(target_path): return { 'action': 'no_op', 'path': filepath, 'hash': file_hash, 'domain': domain, 'subdomain': subdomain, } return { 'action': 'relocate', 'path': filepath, 'hash': file_hash, 'target_dir': target_dir, 'filename': san_name, 'book_title': book_title, 'book_author': book_author, 'orig_filename': orig_filename, 'domain': domain, 'subdomain': subdomain, 'confidence': confidence, } def compute_sweep_plan(db, config): """Walk entire library and compute target for each file. Returns (plan_entries, stats) where plan_entries is a list of dicts and stats summarizes counts. """ library_root = config['library_root'] plan = [] stats = { 'total_files': 0, 'relocate': 0, 'rescue': 0, 'unclassified': 0, 'no_op': 0, 'skip_in_progress': 0, 'skip_failed': 0, 'skip_garbage': 0, 'skip_other': 0, 'errors': 0, 'collision_steps': {1: 0, 2: 0, 3: 0, 4: 0}, 'title_frequency': {}, } logger.info("Computing sweep plan — walking %s", library_root) for root, dirs, files in os.walk(library_root): # Skip underscore-prefixed directories dirs[:] = [d for d in dirs if not d.startswith('_')] for fname in files: if not fname.lower().endswith('.pdf'): continue filepath = os.path.join(root, fname) stats['total_files'] += 1 entry = _sweep_classify_file(filepath, library_root, db, config) if entry is None: continue action = entry.get('action', 'skip') if action == 'relocate': # Resolve collision with author-priority strategy san_name = entry['filename'] target_dir = entry['target_dir'] book_author = entry.get('book_author') book_title = entry.get('book_title') orig_filename = entry.get('orig_filename', fname) author_last = extract_author_last_name(book_author) # Author-priority: files WITH author skip step 1 if author_last: stem, ext = os.path.splitext(san_name) name_with_author = '{}_{}{}'.format(stem, author_last, ext) target_path_2 = os.path.join(target_dir, name_with_author) if not os.path.exists(target_path_2): entry['final_name'] = name_with_author entry['collision_step'] = 2 entry['is_duplicate'] = False else: # Check if same file try: existing_hash = content_hash(target_path_2) except Exception: existing_hash = None if existing_hash == entry['hash']: entry['final_name'] = name_with_author entry['collision_step'] = 2 entry['is_duplicate'] = False else: # Try step 3: add year year = extract_year(orig_filename, book_title) if year: name_with_year = '{}_{}_{}{}'.format(stem, author_last, year, ext) target_path_3 = os.path.join(target_dir, name_with_year) if not os.path.exists(target_path_3): entry['final_name'] = name_with_year entry['collision_step'] = 3 entry['is_duplicate'] = False else: try: existing_hash_3 = content_hash(target_path_3) except Exception: existing_hash_3 = None if existing_hash_3 == entry['hash']: entry['final_name'] = name_with_year entry['collision_step'] = 3 entry['is_duplicate'] = False else: entry['final_name'] = san_name entry['collision_step'] = 4 entry['is_duplicate'] = True else: entry['final_name'] = san_name entry['collision_step'] = 4 entry['is_duplicate'] = True else: # No author — use standard resolve_collision (starts at step 1) final_name, step, is_dup = resolve_collision( san_name, target_dir, entry['hash'], book_author, book_title, orig_filename, db ) entry['final_name'] = final_name entry['collision_step'] = step entry['is_duplicate'] = is_dup step = entry.get('collision_step', 1) stats['collision_steps'][step] = stats['collision_steps'].get(step, 0) + 1 stats['relocate'] += 1 # Track title frequency for garbage report bt = entry.get('book_title', '') if bt: stats['title_frequency'][bt] = stats['title_frequency'].get(bt, 0) + 1 elif action == 'rescue': stats['rescue'] += 1 elif action == 'unclassified': stats['unclassified'] += 1 elif action == 'no_op': stats['no_op'] += 1 elif action == 'skip_in_progress': stats['skip_in_progress'] += 1 elif action == 'skip_failed': stats['skip_failed'] += 1 elif action == 'skip_garbage': stats['skip_garbage'] += 1 bt = entry.get('book_title', '') if bt: stats['title_frequency'][bt] = stats['title_frequency'].get(bt, 0) + 1 elif action == 'error': stats['errors'] += 1 else: stats['skip_other'] += 1 plan.append(entry) # Progress logging if stats['total_files'] % 2000 == 0: logger.info("Sweep plan progress: %d files scanned...", stats['total_files']) logger.info("Sweep plan complete: %d files scanned", stats['total_files']) return plan, stats def _save_sweep_plan(plan, stats, output_dir): """Save sweep plan and summary to files.""" os.makedirs(output_dir, exist_ok=True) plan_file = os.path.join(output_dir, 'sweep_plan.json') with open(plan_file, 'w') as f: json.dump(plan, f, indent=2, default=str) logger.info("Plan saved: %s (%d entries)", plan_file, len(plan)) # Summary summary_file = os.path.join(output_dir, 'sweep_summary.md') lines = [ '# Library Sweep Plan Summary', '', '| Metric | Count |', '|--------|-------|', '| Total files scanned | {} |'.format(stats['total_files']), '| Relocate | {} |'.format(stats['relocate']), '| Rescue (uncataloged) | {} |'.format(stats['rescue']), '| Unclassified | {} |'.format(stats['unclassified']), '| No-op (already correct) | {} |'.format(stats['no_op']), '| Skip in-progress | {} |'.format(stats['skip_in_progress']), '| Skip failed | {} |'.format(stats['skip_failed']), '| Skip garbage title | {} |'.format(stats['skip_garbage']), '| Skip other | {} |'.format(stats['skip_other']), '| Errors | {} |'.format(stats['errors']), '', '## Collision Steps', '', '| Step | Count |', '|------|-------|', ] for step in sorted(stats['collision_steps']): lines.append('| {} | {} |'.format(step, stats['collision_steps'][step])) # Top 50 titles by frequency (garbage report) lines.extend([ '', '## Top 50 Titles by Frequency', '', '| Count | Title |', '|-------|-------|', ]) title_freq = sorted(stats['title_frequency'].items(), key=lambda x: -x[1])[:50] for title, count in title_freq: safe_title = title.replace('|', '\\|')[:80] lines.append('| {} | {} |'.format(count, safe_title)) with open(summary_file, 'w') as f: f.write('\n'.join(lines) + '\n') logger.info("Summary saved: %s", summary_file) # Garbage metadata log garbage_entries = [e for e in plan if e.get('action') == 'skip_garbage'] if garbage_entries: garbage_file = os.path.join(output_dir, 'sweep_garbage_metadata.json') with open(garbage_file, 'w') as f: json.dump(garbage_entries, f, indent=2, default=str) logger.info("Garbage metadata log: %s (%d entries)", garbage_file, len(garbage_entries)) return plan_file, summary_file def execute_sweep_plan(db, config, plan_file, batch_size=500, checkpoint_file=None, max_entries=None): """Execute a sweep plan from JSON file. Moves files in batches with checkpointing for resume support. Returns stats dict. """ library_root = config['library_root'] with open(plan_file, 'r') as f: plan = json.load(f) stats = { 'total': len(plan), 'relocated': 0, 'rescued': 0, 'unclassified_moved': 0, 'no_op_marked': 0, 'duplicates': 0, 'skipped': 0, 'failed': 0, 'qdrant_updated': 0, } # Resume from checkpoint start_index = 0 if checkpoint_file and os.path.exists(checkpoint_file): try: with open(checkpoint_file, 'r') as f: cp = json.load(f) start_index = cp.get('last_completed_index', 0) + 1 logger.info("Resuming from checkpoint index %d", start_index) except Exception: pass if not checkpoint_file: checkpoint_file = plan_file.replace('.json', '_checkpoint.json') logger.info("Executing sweep plan: %d entries (starting at %d)", len(plan), start_index) end_index = min(start_index + max_entries, len(plan)) if max_entries else len(plan) for i in range(start_index, end_index): entry = plan[i] action = entry.get('action', 'skip') try: if action == 'relocate': _execute_relocate(entry, db, config, stats) elif action == 'rescue': _execute_rescue(entry, db, config, stats) elif action == 'unclassified': _execute_unclassified(entry, db, config, stats) elif action == 'no_op': _execute_noop(entry, db, config, stats) else: stats['skipped'] += 1 except Exception as e: stats['failed'] += 1 logger.error("Sweep entry %d failed (%s): %s", i, entry.get('path', '?'), e) # Checkpoint if (i + 1) % batch_size == 0: _save_checkpoint(checkpoint_file, i, stats) logger.info("Checkpoint at index %d: relocated=%d rescued=%d failed=%d", i, stats['relocated'], stats['rescued'], stats['failed']) # Final checkpoint _save_checkpoint(checkpoint_file, end_index - 1, stats) logger.info("Sweep execution complete: %s", json.dumps(stats)) return stats def _execute_relocate(entry, db, config, stats): """Execute a relocate operation from the sweep plan.""" filepath = entry['path'] file_hash = entry['hash'] target_dir = entry['target_dir'] final_name = entry.get('final_name', entry['filename']) is_duplicate = entry.get('is_duplicate', False) collision_step = entry.get('collision_step', 1) book_title = entry.get('book_title') book_author = entry.get('book_author') orig_filename = entry.get('orig_filename', os.path.basename(filepath)) domain = entry.get('domain') subdomain = entry.get('subdomain') if not os.path.exists(filepath): logger.warning("Sweep: source missing, skip: %s", filepath) stats['failed'] += 1 return if is_duplicate: # Move to _duplicates/ dup_dir = config.get('new_pipeline', {}).get('duplicates_dir', '/mnt/library/_ingest/_duplicates') dup_target = os.path.join(dup_dir, os.path.basename(filepath)) if os.path.exists(dup_target): stem, ext = os.path.splitext(os.path.basename(filepath)) dup_target = os.path.join(dup_dir, '{}_{}{}'.format(stem, file_hash[:8], ext)) try: os.makedirs(dup_dir, exist_ok=True) shutil.move(filepath, dup_target) except Exception as e: logger.error("Sweep: duplicate move failed %s: %s", filepath, e) stats['failed'] += 1 return collision_path = os.path.join(target_dir, entry['filename']) collision_hash = None if os.path.exists(collision_path): try: collision_hash = content_hash(collision_path) except Exception: pass db.queue_duplicate_review( doc_hash=file_hash, original_filename=orig_filename, sanitized_filename=entry['filename'], collision_with_hash=collision_hash, collision_path=collision_path, duplicate_path=dup_target, domain=domain, subdomain=subdomain, book_author=book_author, book_title=book_title, ) stats['duplicates'] += 1 return target_path = os.path.join(target_dir, final_name) # Re-check: if target now exists (another file moved there during sweep) if os.path.exists(target_path): try: existing_hash = content_hash(target_path) except Exception: existing_hash = None if existing_hash == file_hash: # Same file already there — just update DB db.update_catalogue_path(file_hash, target_path, final_name) db.sync_document_path(file_hash, target_path, final_name) db.mark_organized(file_hash) stats['no_op_marked'] += 1 return else: # Collision appeared during execution — send to duplicate review dup_dir = config.get('new_pipeline', {}).get('duplicates_dir', '/mnt/library/_ingest/_duplicates') dup_target = os.path.join(dup_dir, '{}_{}'.format(file_hash[:8], os.path.basename(filepath))) try: os.makedirs(dup_dir, exist_ok=True) shutil.move(filepath, dup_target) except Exception as e: logger.error("Sweep: runtime collision move failed %s: %s", filepath, e) stats['failed'] += 1 return db.queue_duplicate_review( doc_hash=file_hash, original_filename=orig_filename, sanitized_filename=final_name, collision_with_hash=existing_hash, collision_path=target_path, duplicate_path=dup_target, domain=domain, subdomain=subdomain, book_author=book_author, book_title=book_title, ) stats['duplicates'] += 1 return # Execute move if not move_file(filepath, target_path, file_hash, db, orig_filename, collision_step, notes='sweep'): stats['failed'] += 1 return # Update DB db.update_catalogue_path(file_hash, target_path, final_name) db.sync_document_path(file_hash, target_path, final_name) db.mark_organized(file_hash) # Update Qdrant points = update_qdrant_payload(file_hash, target_path, final_name, orig_filename, config) if points > 0: stats['qdrant_updated'] += points conn = db._get_conn() conn.execute( "UPDATE file_operations SET qdrant_points_updated = ? " "WHERE doc_hash = ? AND reversed_at IS NULL ORDER BY performed_at DESC LIMIT 1", (points, file_hash) ) conn.commit() stats['relocated'] += 1 def _execute_rescue(entry, db, config, stats): """Move an uncataloged file to _acquired/ for watchdog pickup.""" filepath = entry['path'] file_hash = entry['hash'] target_dir = entry['target_dir'] filename = entry['filename'] if not os.path.exists(filepath): stats['failed'] += 1 return target_path = os.path.join(target_dir, filename) if os.path.exists(target_path): stem, ext = os.path.splitext(filename) target_path = os.path.join(target_dir, '{}_{}{}'.format(file_hash[:8], stem, ext)) try: os.makedirs(target_dir, exist_ok=True) shutil.move(filepath, target_path) logger.debug("Rescued %s -> %s", filepath, target_path) stats['rescued'] += 1 except Exception as e: logger.error("Rescue failed %s: %s", filepath, e) stats['failed'] += 1 def _execute_unclassified(entry, db, config, stats): """Move an unclassifiable file to _unclassified/.""" filepath = entry['path'] file_hash = entry['hash'] target_dir = entry['target_dir'] san_name = entry['filename'] orig_filename = entry.get('orig_filename', os.path.basename(filepath)) if not os.path.exists(filepath): stats['failed'] += 1 return target_path = os.path.join(target_dir, san_name) if os.path.exists(target_path): try: existing_hash = content_hash(target_path) except Exception: existing_hash = None if existing_hash == file_hash: db.mark_organized(file_hash) stats['no_op_marked'] += 1 return stem, ext = os.path.splitext(san_name) target_path = os.path.join(target_dir, '{}_{}{}'.format(stem, file_hash[:8], ext)) if not move_file(filepath, target_path, file_hash, db, orig_filename, notes='sweep_unclassified'): stats['failed'] += 1 return final_name = os.path.basename(target_path) db.update_catalogue_path(file_hash, target_path, final_name) db.sync_document_path(file_hash, target_path, final_name) db.mark_organized(file_hash) points = update_qdrant_payload(file_hash, target_path, final_name, orig_filename, config) if points > 0: stats['qdrant_updated'] = stats.get('qdrant_updated', 0) + points stats['unclassified_moved'] += 1 def _execute_noop(entry, db, config, stats): """Handle a no-op: file already at correct location.""" file_hash = entry['hash'] filepath = entry['path'] filename = os.path.basename(filepath) orig_filename = entry.get('orig_filename', filename) # Ensure DB is consistent db.update_catalogue_path(file_hash, filepath, filename) db.sync_document_path(file_hash, filepath, filename) db.mark_organized(file_hash) # Set original_filename in Qdrant if missing update_qdrant_payload(file_hash, filepath, filename, orig_filename, config) stats['no_op_marked'] += 1 def _save_checkpoint(checkpoint_file, index, stats): """Save checkpoint for resume.""" cp = { 'last_completed_index': index, 'stats': stats, 'timestamp': time.strftime('%Y-%m-%dT%H:%M:%S'), } try: with open(checkpoint_file, 'w') as f: json.dump(cp, f, indent=2) except Exception as e: logger.error("Failed to save checkpoint: %s", e) def verify_sweep(db, config): """Verify sweep results: check file_operations from sweep, confirm files exist. Returns (ok_count, discrepancies) where discrepancies is a list of issue dicts. """ conn = db._get_conn() ops = conn.execute( "SELECT * FROM file_operations WHERE notes LIKE '%sweep%' AND reversed_at IS NULL " "ORDER BY performed_at" ).fetchall() ok = 0 issues = [] for op in ops: op = dict(op) target = op['target_path'] source = op['source_path'] doc_hash = op['doc_hash'] # Check target exists if not os.path.exists(target): issues.append({ 'type': 'target_missing', 'op_id': op['id'], 'doc_hash': doc_hash, 'target_path': target, }) continue # Check source is gone if os.path.exists(source) and os.path.abspath(source) != os.path.abspath(target): issues.append({ 'type': 'source_still_exists', 'op_id': op['id'], 'doc_hash': doc_hash, 'source_path': source, 'target_path': target, }) continue # Check DB consistency doc = db.get_document(doc_hash) if doc and doc.get('path') != target: issues.append({ 'type': 'db_path_mismatch', 'op_id': op['id'], 'doc_hash': doc_hash, 'db_path': doc.get('path'), 'target_path': target, }) continue ok += 1 # Domain distribution domain_counts = {} library_root = config['library_root'] for entry in os.scandir(library_root): if entry.is_dir() and not entry.name.startswith('_'): pdf_count = 0 for root, dirs, files in os.walk(entry.path): dirs[:] = [d for d in dirs if not d.startswith('_')] pdf_count += sum(1 for f in files if f.lower().endswith('.pdf')) if pdf_count > 0: domain_counts[entry.name] = pdf_count return ok, issues, domain_counts