mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
72 lines
2.2 KiB
Python
72 lines
2.2 KiB
Python
#!/usr/bin/env python3
|
|
"""One-time migration: rescan library to detect moved files and sync paths to Qdrant.
|
|
|
|
This rescans all PDFs in the library. The upsert in add_to_catalogue() will
|
|
detect any files whose paths changed since they were originally catalogued,
|
|
and flag them with path_updated_at. Then sync_qdrant_paths() propagates
|
|
those path changes to Qdrant download_url payloads.
|
|
|
|
Usage: cd /opt/recon && source venv/bin/activate && python3 migrate_paths.py [--dry-run]
|
|
"""
|
|
import sys
|
|
import os
|
|
|
|
sys.path.insert(0, '/opt/recon')
|
|
|
|
from recon import scan_library, sync_qdrant_paths
|
|
from lib.status import StatusDB
|
|
from lib.utils import setup_logging
|
|
|
|
logger = setup_logging('recon.migrate')
|
|
|
|
|
|
def main():
    """Rescan the library and propagate changed file paths to Qdrant.

    Steps, in order:
      1. Print the current row counts of the ``catalogue`` and ``documents``
         tables.
      2. Call ``scan_library()`` to rescan the library.
      3. Fetch pending path changes via ``StatusDB.get_path_updates()`` and
         print a preview of at most 20 of them.
      4. Unless ``--dry-run`` is present in ``sys.argv``, call
         ``sync_qdrant_paths()`` and re-check for pending updates.

    NOTE(review): ``--dry-run`` only skips the Qdrant sync. The library
    rescan still runs before the flag is consulted, so anything
    ``scan_library()`` writes to the catalogue happens on a dry run too --
    confirm this is intended.

    Returns:
        int: Process exit status. 0 when nothing needed syncing, on a dry
        run, or when no path updates remain after the sync; 1 when path
        updates are still pending after the sync, so that callers
        (shell scripts, cron, CI) can detect the partial failure.
    """
    preview_limit = 20  # max number of changed paths listed on stdout
    dry_run = '--dry-run' in sys.argv

    db = StatusDB()
    conn = db._get_conn()

    total_cat = conn.execute("SELECT COUNT(*) FROM catalogue").fetchone()[0]
    total_docs = conn.execute("SELECT COUNT(*) FROM documents").fetchone()[0]
    print(f"Before: {total_cat} catalogue entries, {total_docs} documents")

    # Rescan library — upsert will detect and flag path changes
    print("\nScanning library (this will re-hash all files)...")
    count = scan_library()
    print(f"Scanned {count} PDFs")

    # Check how many paths changed
    updates = db.get_path_updates()
    print(f"\nDetected {len(updates)} path changes")

    if not updates:
        print("No paths need syncing — all up to date")
        return 0

    # Show what changed (truncated so a large migration stays readable)
    for row in updates[:preview_limit]:
        print(f" {row['hash'][:8]} {row['filename']}")
    if len(updates) > preview_limit:
        print(f" ... and {len(updates) - preview_limit} more")

    if dry_run:
        print(f"\n[DRY RUN] Would sync {len(updates)} paths to Qdrant. Re-run without --dry-run to apply.")
        return 0

    # Sync to Qdrant
    print(f"\nSyncing {len(updates)} paths to Qdrant...")
    synced = sync_qdrant_paths()
    print(f"Synced {synced} document paths to Qdrant")

    # Verify: anything still reported as pending was not synced.
    remaining = db.get_path_updates()
    if remaining:
        print(f"\nWARNING: {len(remaining)} paths still pending (Qdrant sync may have partially failed)")
        # Signal the incomplete migration through the exit status instead of
        # reporting success.
        return 1

    print("\nAll paths synced successfully")
    return 0
|
|
|
|
|
|
# Script entry point: run the migration and hand main()'s return value to
# sys.exit() so it becomes the process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
|