mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Initial commit: RECON codebase baseline
Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
563c16bb71
59 changed files with 18327 additions and 0 deletions
72
migrate_paths.py
Normal file
72
migrate_paths.py
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
#!/usr/bin/env python3
|
||||
"""One-time migration: rescan library to detect moved files and sync paths to Qdrant.
|
||||
|
||||
This rescans all PDFs in the library. The upsert in add_to_catalogue() will
|
||||
detect any files whose paths changed since they were originally catalogued,
|
||||
and flag them with path_updated_at. Then sync_qdrant_paths() propagates
|
||||
those path changes to Qdrant download_url payloads.
|
||||
|
||||
Usage: cd /opt/recon && source venv/bin/activate && python3 migrate_paths.py [--dry-run]
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
|
||||
sys.path.insert(0, '/opt/recon')
|
||||
|
||||
from recon import scan_library, sync_qdrant_paths
|
||||
from lib.status import StatusDB
|
||||
from lib.utils import setup_logging
|
||||
|
||||
logger = setup_logging('recon.migrate')
|
||||
|
||||
|
||||
def main():
    """Rescan the library, report changed paths and sync them to Qdrant.

    Passing ``--dry-run`` on the command line stops after the change report,
    before sync_qdrant_paths() is called. The library rescan itself still
    runs in dry-run mode, because it is what surfaces the path changes.

    Returns:
        Process exit status. 0 when nothing needed syncing, when running in
        dry-run mode, or when no path updates remain after the sync; 1 when
        path updates are still pending after the sync.
    """
    dry_run = '--dry-run' in sys.argv
    preview_limit = 20  # cap on how many changed entries are listed

    db = StatusDB()
    # NOTE(review): this uses StatusDB's private connection accessor; the
    # connection's lifetime is presumably managed by StatusDB - confirm.
    conn = db._get_conn()

    total_cat = conn.execute("SELECT COUNT(*) FROM catalogue").fetchone()[0]
    total_docs = conn.execute("SELECT COUNT(*) FROM documents").fetchone()[0]
    print(f"Before: {total_cat} catalogue entries, {total_docs} documents")

    # Rescan library — upsert will detect and flag path changes
    print("\nScanning library (this will re-hash all files)...")
    count = scan_library()
    print(f"Scanned {count} PDFs")

    # Check how many paths changed
    updates = db.get_path_updates()
    print(f"\nDetected {len(updates)} path changes")

    if not updates:
        print("No paths need syncing — all up to date")
        return 0

    # Show what changed (only the first few, to keep the output readable)
    for row in updates[:preview_limit]:
        print(f" {row['hash'][:8]} {row['filename']}")
    if len(updates) > preview_limit:
        print(f" ... and {len(updates) - preview_limit} more")

    if dry_run:
        print(f"\n[DRY RUN] Would sync {len(updates)} paths to Qdrant. Re-run without --dry-run to apply.")
        return 0

    # Sync to Qdrant
    print(f"\nSyncing {len(updates)} paths to Qdrant...")
    synced = sync_qdrant_paths()
    print(f"Synced {synced} document paths to Qdrant")

    # Verify: anything still reported as a pending update was not synced
    remaining = db.get_path_updates()
    if remaining:
        print(f"\nWARNING: {len(remaining)} paths still pending (Qdrant sync may have partially failed)")
        # Signal the incomplete sync to the caller/shell instead of
        # reporting success.
        return 1

    print("\nAll paths synced successfully")
    return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: main()'s return value becomes the exit status.
    raise SystemExit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue