# recon/lib/organizer.py

"""
RECON Library Organizer

After a document completes the pipeline (extract -> enrich -> embed),
this module classifies it by dominant domain and moves it into the
correct Domain/Subdomain/ folder with a sanitized filename.

Two modes:
  1. Per-document: determine_dominant_domain() from on-disk concept JSONs
  2. Bulk manifest: organize_from_manifest() using pre-built manifest JSON

Path updates trigger the existing catalogue.path_updated_at mechanism,
which sync_qdrant_paths() propagates to Qdrant payloads.
"""
import json
import logging
import os
import shutil
from collections import Counter

from .utils import sanitize_filename

logger = logging.getLogger('recon.organizer')

# ── Domain folder mapping (canonical) ───────────────────────────────────
# Keys = exact domain strings from Gemini enrichment
# Values = filesystem-safe folder names

# Every folder name is derived mechanically from its canonical domain
# string: ' & ' becomes '-and-' and each remaining space becomes a hyphen.
# Listing only the domain strings keeps the keys and folder names in step.
DOMAIN_FOLDERS = {
    canonical: canonical.replace(' & ', '-and-').replace(' ', '-')
    for canonical in (
        'Agriculture & Livestock',
        'Civil Organization',
        'Communications',
        'Food Systems',
        'Foundational Skills',
        'Logistics',
        'Medical',
        'Navigation',
        'Operations',
        'Power Systems',
        'Preservation & Storage',
        'Security',
        'Shelter & Construction',
        'Technology',
        'Tools & Equipment',
        'Vehicles',
        'Water Systems',
        'Wilderness Skills',
    )
}


def normalize_folder_name(name):
    """Normalize a domain/subdomain name to a single folder-safe path component.

    Words are joined with hyphens and capitalized, except for a fixed set of
    small words ('and', 'of', 'the', ...) which are lower-cased. '&' is
    spelled out as 'and'. Path separators and NUL bytes are treated as word
    breaks so the result can never introduce extra directory levels.

    Examples:
        'Edible Plants & Foraging' -> 'Edible-Plants-and-Foraging'
        'emergency medicine' -> 'Emergency-Medicine'
        'R&D' -> 'R-and-D'
        'HVAC/Plumbing' -> 'Hvac-Plumbing'

    Args:
        name: Raw domain or subdomain label (string), or a falsy value.

    Returns:
        The folder name, or 'Uncategorized' when the input is empty,
        whitespace-only, or would reduce to a dots-only component such as
        '.' or '..'.
    """
    if not name:
        return 'Uncategorized'

    # The result is used as ONE path component; a separator inside it would
    # silently nest directories (or, with '..', climb out of the library).
    cleaned = name
    for forbidden in ('/', '\\', '\0'):
        cleaned = cleaned.replace(forbidden, ' ')

    # Pad the replacement so an unspaced ampersand ('R&D') still yields a
    # separate word instead of fusing into 'Randd'.
    cleaned = cleaned.replace('&', ' and ')

    small_words = ('and', 'of', 'the', 'to', 'for', 'in', 'on', 'at')
    parts = [
        word.lower() if word.lower() in small_words else word.capitalize()
        for word in cleaned.split()
    ]
    folder = '-'.join(parts)

    # Covers whitespace-only input ('') as well as '.', '..' and friends.
    if not folder.strip('.'):
        return 'Uncategorized'
    return folder


def _unique_labels(value):
    """Return the distinct string labels in a concept field, in first-seen order.

    The field may be a single string or a list; non-string items and any
    other type yield no labels.
    """
    if isinstance(value, str):
        return [value]
    if isinstance(value, list):
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(v for v in value if isinstance(v, str)))
    return []


def determine_dominant_domain(doc_hash, data_dir):
    """Determine a document's dominant domain from on-disk concept JSONs.

    Reads every {data_dir}/concepts/{doc_hash}/window_*.json file, counts
    how many concepts carry each domain, and returns the most frequent one
    together with the most frequent subdomain *among the concepts tagged
    with that domain*.

    Files that cannot be read or parsed, files whose top level is not a
    list, and list items that are not dicts are skipped (best effort).
    Files are visited in sorted name order so ties resolve the same way on
    every run.

    The result is treated as ambiguous, and no domain is returned, when the
    runner-up domain has at least 80% of the top domain's count or when the
    confidence is below 0.4.

    Args:
        doc_hash: Document hash
        data_dir: Path to the data directory containing 'concepts/'

    Returns:
        (domain, subdomain, confidence) tuple.
        domain/subdomain are strings or None.
        confidence is float 0-1 (concepts carrying the top domain / total
        concepts); it is still reported when the result is ambiguous.
    """
    concepts_dir = os.path.join(data_dir, 'concepts', doc_hash)
    if not os.path.isdir(concepts_dir):
        return (None, None, 0.0)

    domain_counter = Counter()
    subdomains_by_domain = {}
    total_concepts = 0

    for fname in sorted(os.listdir(concepts_dir)):
        if not (fname.startswith('window_') and fname.endswith('.json')):
            continue
        fpath = os.path.join(concepts_dir, fname)
        try:
            with open(fpath, 'r', encoding='utf-8') as f:
                concepts = json.load(f)
        except (ValueError, OSError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            continue

        if not isinstance(concepts, list):
            continue

        for concept in concepts:
            if not isinstance(concept, dict):
                continue
            total_concepts += 1

            # domain is usually a list with one element; a label repeated
            # inside one concept counts once so confidence stays within 0-1.
            subdomains = _unique_labels(concept.get('subdomain'))
            for dom in _unique_labels(concept.get('domain')):
                domain_counter[dom] += 1
                subdomains_by_domain.setdefault(dom, Counter()).update(subdomains)

    if total_concepts == 0 or not domain_counter:
        return (None, None, 0.0)

    top_domains = domain_counter.most_common(2)
    dom_name, dom_count = top_domains[0]
    confidence = dom_count / total_concepts

    # Check ambiguity
    runner_up_close = (
        len(top_domains) >= 2 and top_domains[1][1] >= dom_count * 0.8
    )
    if runner_up_close or confidence < 0.4:
        return (None, None, confidence)

    # Only subdomains seen alongside the winning domain are candidates;
    # otherwise a minority domain's subdomain could name the folder.
    top_sub = subdomains_by_domain[dom_name].most_common(1)
    sub_name = top_sub[0][0] if top_sub else None

    return (dom_name, sub_name, confidence)


def _build_target_path(library_root, domain, subdomain, filename, doc_hash,
                       current_path=None):
    """Build the target path for a document, handling domain mapping and collisions.

    The domain folder comes from DOMAIN_FOLDERS, falling back to
    normalize_folder_name() for domains not in the table. A missing
    subdomain maps to the 'General' folder.

    If something already exists at the target, a ' [<first 6 hash chars>]'
    suffix is added to the stem (truncating the stem so the name stays
    within 120 characters). The document's own file does not count as a
    collision: when current_path is given and already equals the target,
    the name is left untouched so re-organizing is a no-op rather than a
    rename.

    Args:
        library_root: Root directory of the organized library
        domain: Dominant domain string, or None if unclassified
        subdomain: Subdomain string, or None/empty
        filename: Current filename (passed to sanitize_filename)
        doc_hash: Document hash
        current_path: Optional path where the document currently lives

    Returns:
        (target_path, sanitized_filename) tuple; target_path is None when
        domain is None.
    """
    san_name = sanitize_filename(filename, doc_hash=doc_hash)

    if domain is None:
        # Unclassified — leave in place (don't move to Review folder for pipeline)
        return (None, san_name)

    domain_folder = DOMAIN_FOLDERS.get(domain) or normalize_folder_name(domain)
    sub_folder = normalize_folder_name(subdomain) if subdomain else 'General'

    target_dir = os.path.join(library_root, domain_folder, sub_folder)
    target_path = os.path.join(target_dir, san_name)

    occupied_by_self = (
        current_path is not None
        and os.path.abspath(current_path) == os.path.abspath(target_path)
    )

    # Handle collision at target (a different file already has this name)
    if os.path.exists(target_path) and not occupied_by_self:
        stem, ext = os.path.splitext(san_name)
        h6 = doc_hash[:6]
        new_name = '{} [{}]{}'.format(stem, h6, ext)
        if len(new_name) > 120:
            # 9 = len(' [') + 6 hash chars + len(']')
            max_stem = 120 - len(ext) - 9
            stem = stem[:max_stem].rstrip('. -,')
            new_name = '{} [{}]{}'.format(stem, h6, ext)
        san_name = new_name
        target_path = os.path.join(target_dir, san_name)

    return (target_path, san_name)


def organize_document(doc_hash, db, config, dry_run=False):
    """Organize a single document: classify, rename, and move.

    Looks the document up in the catalogue, determines its dominant domain
    from the on-disk concept JSONs, and moves the file to
    library_root/Domain/Subdomain/. After a successful move the catalogue
    path is updated and the document is marked organized. If recording the
    new path fails, the file is moved back so catalogue and disk stay in
    agreement.

    Args:
        doc_hash: Document hash
        db: StatusDB instance
        config: RECON config dict (reads 'library_root' and paths.data)
        dry_run: If True, don't actually move files or touch the database

    Returns:
        dict with keys: hash, action, before_path, after_path, domain,
        subdomain, error. action is one of 'skip', 'skip_unclassified',
        'already_organized', 'would_move', 'moved', 'error'.
    """
    library_root = config['library_root']
    data_dir = config['paths']['data']

    result = {
        'hash': doc_hash,
        'action': 'skip',
        'before_path': None,
        'after_path': None,
        'domain': None,
        'subdomain': None,
        'error': None,
    }

    # Look up current path from catalogue
    conn = db._get_conn()
    row = conn.execute(
        "SELECT path, filename FROM catalogue WHERE hash = ?", (doc_hash,)
    ).fetchone()
    if not row:
        result['error'] = 'Not in catalogue'
        return result

    current_path = row['path']
    current_filename = row['filename']
    result['before_path'] = current_path

    # A NULL/empty path cannot be checked or moved; report it instead of
    # letting os.path raise a TypeError further down.
    if not current_path:
        result['error'] = 'No path in catalogue'
        return result

    # Verify file exists on disk
    if not dry_run and not os.path.exists(current_path):
        result['error'] = 'File not found on disk'
        return result

    # Determine domain from concept JSONs
    domain, subdomain, _confidence = determine_dominant_domain(doc_hash, data_dir)
    result['domain'] = domain
    result['subdomain'] = subdomain

    if domain is None:
        result['action'] = 'skip_unclassified'
        return result

    # Build target path; passing current_path keeps the document's own file
    # from being mistaken for a name collision.
    target_path, san_name = _build_target_path(
        library_root, domain, subdomain,
        current_filename or os.path.basename(current_path),
        doc_hash, current_path=current_path,
    )

    if target_path is None:
        result['action'] = 'skip_unclassified'
        return result

    result['after_path'] = target_path

    # Already at target?
    if os.path.abspath(current_path) == os.path.abspath(target_path):
        result['action'] = 'already_organized'
        # Still mark as organized
        if not dry_run:
            db.mark_organized(doc_hash)
        return result

    if dry_run:
        result['action'] = 'would_move'
        return result

    # Move the file. Refuse to overwrite: shutil.move may silently replace
    # an existing file at the destination.
    try:
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        if os.path.exists(target_path):
            raise FileExistsError(
                'Target already exists: {}'.format(target_path))
        shutil.move(current_path, target_path)
    except Exception as e:
        result['action'] = 'error'
        result['error'] = str(e)
        logger.error("Failed to organize %s: %s", doc_hash[:8], e)
        return result

    # Update catalogue (triggers path_updated_at for Qdrant sync)
    path_recorded = False
    try:
        db.update_catalogue_path(doc_hash, target_path, san_name)
        path_recorded = True
        db.mark_organized(doc_hash)
    except Exception as e:
        result['action'] = 'error'
        result['error'] = str(e)
        logger.error("Failed to organize %s: %s", doc_hash[:8], e)
        if not path_recorded:
            # The catalogue still points at the old location; put the file
            # back so the next run can find it.
            try:
                shutil.move(target_path, current_path)
            except Exception as undo_error:
                logger.error("Could not restore %s to %s: %s",
                             doc_hash[:8], current_path, undo_error)
        return result

    result['action'] = 'moved'
    logger.info("Organized %s -> %s [%s/%s]",
                 doc_hash[:8], target_path, domain, subdomain)
    return result


def organize_from_manifest(manifest_path, db, config, dry_run=False):
    """Bulk migration using a pre-built manifest JSON.

    The manifest is produced by recon_manifest_builder.py and contains
    entries with current_path, sanitized_path, sanitized_filename, hash, etc.
    Each entry is handled independently: a malformed entry or a failed move
    is counted under 'errors' and does not stop the run. Progress is logged
    after every 1000 entries regardless of how those entries were resolved.

    Args:
        manifest_path: Path to manifest JSON file (a list of entry dicts)
        db: StatusDB instance
        config: RECON config dict (not read here; kept for a uniform signature)
        dry_run: If True, don't actually move files; entries that would be
            moved are counted under 'moved' without checking the source

    Returns:
        dict with summary stats: total, moved, skipped, already_organized,
        errors, not_found
    """
    with open(manifest_path, 'r', encoding='utf-8') as f:
        entries = json.load(f)

    stats = {
        'total': len(entries),
        'moved': 0,
        'skipped': 0,
        'already_organized': 0,
        'errors': 0,
        'not_found': 0,
    }

    for position, entry in enumerate(entries, start=1):
        stats[_apply_manifest_entry(entry, db, dry_run)] += 1

        # Progress reporting
        if position % 1000 == 0:
            logger.info("Manifest progress: %d / %d (moved=%d, errors=%d)",
                        position, stats['total'], stats['moved'], stats['errors'])

    return stats


def _apply_manifest_entry(entry, db, dry_run):
    """Process one manifest entry and return the stats key to increment.

    Returns one of 'moved', 'skipped', 'already_organized', 'errors',
    'not_found'.
    """
    try:
        doc_hash = entry['hash']
        current_path = entry['current_path']
    except (KeyError, TypeError):
        logger.warning("Manifest: malformed entry skipped: %r", entry)
        return 'errors'

    target_path = entry.get('sanitized_path', entry.get('proposed_path'))
    san_name = entry.get('sanitized_filename', entry.get('filename'))

    if not target_path or not san_name:
        return 'skipped'

    # Skip ambiguous entries
    if entry.get('ambiguous'):
        return 'skipped'

    # Already at target?
    if os.path.abspath(current_path) == os.path.abspath(target_path):
        if not dry_run:
            db.mark_organized(doc_hash)
        return 'already_organized'

    if dry_run:
        return 'moved'

    # Verify source exists
    if not os.path.exists(current_path):
        logger.warning("Manifest: file not found: %s [%s]", current_path, doc_hash[:8])
        return 'not_found'

    moved_to = None
    path_recorded = False
    try:
        target_dir = os.path.dirname(target_path)
        os.makedirs(target_dir, exist_ok=True)

        # Check for collision at target (different file already there)
        if os.path.exists(target_path):
            stem, ext = os.path.splitext(san_name)
            h6 = doc_hash[:6]
            san_name = '{} [{}]{}'.format(stem, h6, ext)
            if len(san_name) > 120:
                # Same 120-character cap as _build_target_path;
                # 9 = len(' [') + 6 hash chars + len(']')
                stem = stem[:120 - len(ext) - 9].rstrip('. -,')
                san_name = '{} [{}]{}'.format(stem, h6, ext)
            target_path = os.path.join(target_dir, san_name)
            if os.path.exists(target_path):
                # shutil.move may silently replace an existing file.
                raise FileExistsError(
                    'Target already exists: {}'.format(target_path))

        shutil.move(current_path, target_path)
        moved_to = target_path

        # Update catalogue + mark organized
        db.update_catalogue_path(doc_hash, target_path, san_name)
        path_recorded = True
        db.mark_organized(doc_hash)
        return 'moved'

    except Exception as e:
        logger.error("Manifest: failed to move %s: %s", doc_hash[:8], e)
        if moved_to is not None and not path_recorded:
            # The catalogue still points at the old location; put the file
            # back so disk and catalogue agree.
            try:
                shutil.move(moved_to, current_path)
            except Exception as undo_error:
                logger.error("Manifest: could not restore %s to %s: %s",
                             doc_hash[:8], current_path, undo_error)
        return 'errors'