recon/migrate_paths.py
Matt 563c16bb71 Initial commit: RECON codebase baseline
Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete).
Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-14 14:57:23 +00:00

72 lines
2.2 KiB
Python

#!/usr/bin/env python3
"""One-time migration: rescan library to detect moved files and sync paths to Qdrant.
This rescans all PDFs in the library. The upsert in add_to_catalogue() will
detect any files whose paths changed since they were originally catalogued,
and flag them with path_updated_at. Then sync_qdrant_paths() propagates
those path changes to Qdrant download_url payloads.
Usage: cd /opt/recon && source venv/bin/activate && python3 migrate_paths.py [--dry-run]
"""
import sys
# NOTE(review): os is not referenced anywhere in the visible code — confirm before removing.
import os
# Put the install directory first on the import search path so the project
# imports below resolve when this script is run directly.
# NOTE(review): assumes the deployment lives at /opt/recon, matching the
# Usage line in the module docstring — confirm for other environments.
sys.path.insert(0, '/opt/recon')
from recon import scan_library, sync_qdrant_paths
from lib.status import StatusDB
from lib.utils import setup_logging
# Module-level logger named 'recon.migrate'. Nothing in the visible code of
# this script writes to it; all user-facing output goes through print().
logger = setup_logging('recon.migrate')
def main():
    """Run the one-time path migration and return a process exit status.

    Steps, in order:

    1. Print the current row counts of the ``catalogue`` and ``documents``
       tables.
    2. Rescan the library with ``scan_library()``.
    3. Fetch the pending path changes with ``StatusDB.get_path_updates()``
       and list at most the first 20 of them.
    4. Unless ``--dry-run`` appears in ``sys.argv``, call
       ``sync_qdrant_paths()`` and re-check that nothing is left pending.

    ``--dry-run`` only skips the Qdrant sync in step 4; the library rescan
    in step 2 still runs.

    Returns:
        int: 0 when there was nothing to sync, when running as a dry run,
        or when no path updates remain after the sync; 1 when path updates
        are still pending after the sync, so that callers and shell
        pipelines can detect the partial failure.
    """
    # Number of changed paths echoed to the terminal before summarising.
    preview_limit = 20

    dry_run = '--dry-run' in sys.argv

    db = StatusDB()
    # NOTE(review): _get_conn() is a private StatusDB helper; its connection
    # lifetime is not visible here, so the connection is not closed by this
    # function — confirm against lib.status.
    conn = db._get_conn()
    total_cat = conn.execute("SELECT COUNT(*) FROM catalogue").fetchone()[0]
    total_docs = conn.execute("SELECT COUNT(*) FROM documents").fetchone()[0]
    print(f"Before: {total_cat} catalogue entries, {total_docs} documents")

    # Rescan library — upsert will detect and flag path changes
    print("\nScanning library (this will re-hash all files)...")
    count = scan_library()
    print(f"Scanned {count} PDFs")

    # Check how many paths changed
    updates = db.get_path_updates()
    print(f"\nDetected {len(updates)} path changes")
    if not updates:
        print("No paths need syncing — all up to date")
        return 0

    # Show what changed (short hash prefix plus filename), capped for readability
    for row in updates[:preview_limit]:
        print(f" {row['hash'][:8]} {row['filename']}")
    if len(updates) > preview_limit:
        print(f" ... and {len(updates) - preview_limit} more")

    if dry_run:
        print(f"\n[DRY RUN] Would sync {len(updates)} paths to Qdrant. Re-run without --dry-run to apply.")
        return 0

    # Sync to Qdrant
    print(f"\nSyncing {len(updates)} paths to Qdrant...")
    synced = sync_qdrant_paths()
    print(f"Synced {synced} document paths to Qdrant")

    # Verify: anything still reported as pending was not propagated.
    remaining = db.get_path_updates()
    if remaining:
        print(f"\nWARNING: {len(remaining)} paths still pending (Qdrant sync may have partially failed)")
        # Signal the incomplete migration through a non-zero exit status
        # instead of reporting success alongside a warning.
        return 1

    print("\nAll paths synced successfully")
    return 0
if __name__ == "__main__":
    # Hand main()'s integer return value to the interpreter as the exit status.
    raise SystemExit(main())