"""
RECON shared filing logic.

Provides file_processed_item() — the shared function that any processor can
call to file a completed item from /opt/recon/data/processing/{hash}/ into
/mnt/library/Domain/Subdomain/{canonical_name}.{ext}.

The function:
  1. Reads dominant domain from concept JSONs (existing logic)
  2. Derives canonical name (level 1, escalating to 2/3/4 only on collision)
  3. Moves the source file from processing/ to library/Domain/Subdomain/
  4. Updates catalogue + documents + Qdrant payloads
  5. Marks organized

Steps 3-5 are NOT transactional: if a DB/Qdrant update fails after the file
has been moved, the file stays at its new location and the failure is
reported through the returned result dict.

This function does NOT extract, enrich, or embed. Those are upstream stages.
This function does NOT touch the legacy organize_document() — that stays in
place until cutover (Phase 5).

Phase 2: function exists, is tested in isolation. Not yet called by anything
in the service loop.
"""

import filecmp
import logging
import os
import shutil

from .organizer import determine_dominant_domain, _build_target_path
from .new_pipeline import update_qdrant_payload

logger = logging.getLogger("recon.filing")


def _would_clobber(source_path, target_path):
    """Return True if moving source_path onto target_path would destroy data.

    An existing target is only considered safe to replace when it is a
    regular file whose content is byte-identical to the source. Directories,
    broken symlinks, unreadable files, and files with different content all
    count as a clobber.
    """
    if not os.path.lexists(target_path):
        return False
    try:
        # shallow=False forces a content comparison rather than trusting
        # size/mtime; cmp() returns False for anything that is not a
        # regular file.
        return not filecmp.cmp(source_path, target_path, shallow=False)
    except OSError:
        # Cannot prove the contents match, so treat it as a conflict.
        return True


def file_processed_item(doc_hash, source_file_path, db, config, dry_run=False):
    """File a completed item into the library.

    Args:
        doc_hash: Document hash
        source_file_path: Current absolute path to the source file
            (typically in /opt/recon/data/processing/{hash}/ or current
            library path)
        db: StatusDB instance
        config: RECON config dict; reads config["paths"]["data"] and
            config["library_root"]
        dry_run: If True, plan but don't move

    Returns:
        dict with keys: hash, action, source_path, target_path, domain,
        subdomain, qdrant_points_updated, error

        action is one of:
            "filed"              - file moved and DB/Qdrant updated
            "would_file"         - dry run; the move was planned only
            "skip_already_filed" - source is already at the target path
            "skip_unclassified"  - no domain / no target path available
            "error"              - see the "error" key for the reason. If
                                   the error text starts with "DB/Qdrant
                                   update failed after move", the file HAS
                                   been moved to target_path.
    """
    result = {
        "hash": doc_hash,
        "action": "skip",
        "source_path": source_file_path,
        "target_path": None,
        "domain": None,
        "subdomain": None,
        "qdrant_points_updated": 0,
        "error": None,
    }

    # Verify source file exists
    if not os.path.exists(source_file_path):
        result["action"] = "error"
        result["error"] = f"Source file not found: {source_file_path}"
        return result

    # Determine domain from existing concept JSONs
    data_dir = config["paths"]["data"]
    domain, subdomain, _confidence = determine_dominant_domain(doc_hash, data_dir)
    result["domain"] = domain
    result["subdomain"] = subdomain
    if domain is None:
        result["action"] = "skip_unclassified"
        return result

    # Get the original filename from catalogue
    conn = db._get_conn()
    row = conn.execute(
        "SELECT filename FROM catalogue WHERE hash = ?", (doc_hash,)
    ).fetchone()
    if not row:
        result["action"] = "error"
        result["error"] = f"Hash not in catalogue: {doc_hash}"
        return result
    original_filename = row["filename"]

    # Build target path using existing collision-handling logic
    library_root = config["library_root"]
    target_path, sanitized_name = _build_target_path(
        library_root, domain, subdomain, original_filename, doc_hash
    )
    if target_path is None:
        result["action"] = "skip_unclassified"
        return result

    # Fix 1.1: Preserve the source file's actual extension instead of
    # the default .pdf that sanitize_filename() may have applied
    source_ext = os.path.splitext(source_file_path)[1].lower()
    if source_ext:
        target_path = os.path.splitext(target_path)[0] + source_ext
        sanitized_name = os.path.splitext(sanitized_name)[0] + source_ext
    result["target_path"] = target_path

    # If already at target (idempotency), just mark organized
    if os.path.abspath(source_file_path) == os.path.abspath(target_path):
        result["action"] = "skip_already_filed"
        if not dry_run:
            db.mark_organized(doc_hash)
        return result

    # The extension swap above changes the path AFTER _build_target_path()
    # produced it, so the final path has not been collision-checked here.
    # shutil.move() would silently replace an existing file (or move the
    # source *into* an existing directory), so refuse unless the existing
    # target is a byte-identical copy. Checked before the dry-run return so
    # that a dry run does not promise a move that would be refused.
    if _would_clobber(source_file_path, target_path):
        result["action"] = "error"
        result["error"] = (
            f"Target already exists with different content: {target_path}"
        )
        logger.error(
            "Refusing to overwrite %s while filing %s", target_path, doc_hash[:8]
        )
        return result

    if dry_run:
        result["action"] = "would_file"
        return result

    # Move the file
    try:
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        shutil.move(source_file_path, target_path)
    except Exception as e:
        result["action"] = "error"
        result["error"] = f"Move failed: {e}"
        logger.exception("Move failed for %s: %s", doc_hash[:8], e)
        return result

    # Update DB and Qdrant
    try:
        db.update_catalogue_path(doc_hash, target_path, sanitized_name)
        db.sync_document_path(doc_hash, target_path, sanitized_name)
        db.mark_organized(doc_hash)

        # Update Qdrant payloads (download_url, filename, original_filename)
        points = update_qdrant_payload(
            doc_hash, target_path, sanitized_name, original_filename, config
        )
        result["qdrant_points_updated"] = points
        result["action"] = "filed"
        logger.info(
            "Filed %s -> %s [%s/%s, %d vectors]",
            doc_hash[:8],
            target_path,
            domain,
            subdomain,
            points,
        )
    except Exception as e:
        # File was moved but DB update failed — log the dangerous state.
        # The file is intentionally left at target_path (no rollback);
        # logger.exception keeps the traceback for manual reconciliation.
        result["action"] = "error"
        result["error"] = f"DB/Qdrant update failed after move: {e}"
        logger.exception("DB/Qdrant update failed for %s: %s", doc_hash[:8], e)

    return result