mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 14:44:54 +02:00
1502 lines
56 KiB
Python
1502 lines
56 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
RECON CLI — Main entry point.
|
||
|
|
|
||
|
|
Subcommands: scan, queue, extract, enrich, embed, run, search, upload,
|
||
|
|
ingest-url, crawl, ingest-peertube, organize, status, catalogue, failures, validate, rebuild, serve, ingest.
|
||
|
|
|
||
|
|
Usage: cd /opt/recon && source venv/bin/activate && python3 recon.py <command>
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import signal
|
||
|
|
import shutil
|
||
|
|
import sys
|
||
|
|
import threading
|
||
|
|
import time
|
||
|
|
|
||
|
|
from lib.utils import get_config, content_hash, derive_source_and_category, generate_download_url, setup_logging
|
||
|
|
from lib.status import StatusDB
|
||
|
|
|
||
|
|
logger = setup_logging('recon.cli')
|
||
|
|
|
||
|
|
|
||
|
|
# ── Standalone functions (used by both CLI and service) ──────────────────
|
||
|
|
|
||
|
|
def scan_library(path=None):
|
||
|
|
"""Scan library tree and catalogue new PDFs. Returns count of PDFs catalogued."""
|
||
|
|
config = get_config()
|
||
|
|
db = StatusDB()
|
||
|
|
library_root = config['library_root']
|
||
|
|
scan_root = path or library_root
|
||
|
|
|
||
|
|
if not os.path.exists(scan_root):
|
||
|
|
logger.warning(f"Scan path not found: {scan_root}")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
count = 0
|
||
|
|
for root, dirs, files in os.walk(scan_root):
|
||
|
|
dirs[:] = [d for d in dirs if not d.startswith('_')] # Skip staging dirs (_acquired, _ingest)
|
||
|
|
for fname in files:
|
||
|
|
if not fname.lower().endswith('.pdf'):
|
||
|
|
continue
|
||
|
|
filepath = os.path.join(root, fname)
|
||
|
|
try:
|
||
|
|
fhash = content_hash(filepath)
|
||
|
|
size = os.path.getsize(filepath)
|
||
|
|
source, category = derive_source_and_category(filepath, library_root)
|
||
|
|
db.add_to_catalogue(fhash, fname, filepath, size, source, category)
|
||
|
|
count += 1
|
||
|
|
except Exception as e:
|
||
|
|
logger.warning(f"Failed to catalogue {filepath}: {e}")
|
||
|
|
|
||
|
|
return count
|
||
|
|
|
||
|
|
|
||
|
|
def queue_all():
|
||
|
|
"""Queue all unprocessed catalogue items. Returns count queued."""
|
||
|
|
db = StatusDB()
|
||
|
|
items = db.get_catalogued()
|
||
|
|
queued = 0
|
||
|
|
for item in items:
|
||
|
|
if db.queue_document(item['hash']):
|
||
|
|
queued += 1
|
||
|
|
return queued
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
def sync_qdrant_paths():
|
||
|
|
"""Sync updated file paths to Qdrant vector payloads.
|
||
|
|
|
||
|
|
After scan_library() upserts catalogue paths, any path changes are flagged
|
||
|
|
with path_updated_at. This function propagates those path changes to the
|
||
|
|
Qdrant download_url payload so citations stay valid.
|
||
|
|
|
||
|
|
Returns count of Qdrant points updated.
|
||
|
|
"""
|
||
|
|
from qdrant_client import QdrantClient
|
||
|
|
from qdrant_client.models import FieldCondition, MatchValue, Filter
|
||
|
|
|
||
|
|
config = get_config()
|
||
|
|
db = StatusDB()
|
||
|
|
|
||
|
|
updates = db.get_path_updates()
|
||
|
|
if not updates:
|
||
|
|
return 0
|
||
|
|
|
||
|
|
library_root = config['library_root']
|
||
|
|
qdrant = QdrantClient(
|
||
|
|
host=config['vector_db']['host'],
|
||
|
|
port=config['vector_db']['port'],
|
||
|
|
timeout=60
|
||
|
|
)
|
||
|
|
collection = config['vector_db']['collection']
|
||
|
|
|
||
|
|
synced = 0
|
||
|
|
for row in updates:
|
||
|
|
doc_hash = row['hash']
|
||
|
|
new_path = row['path']
|
||
|
|
new_filename = row['filename']
|
||
|
|
|
||
|
|
# Update documents table path
|
||
|
|
db.sync_document_path(doc_hash, new_path, new_filename)
|
||
|
|
|
||
|
|
# Build new download URL
|
||
|
|
new_url = generate_download_url(new_path, library_root)
|
||
|
|
|
||
|
|
# Find all Qdrant points for this document
|
||
|
|
try:
|
||
|
|
hits = qdrant.scroll(
|
||
|
|
collection_name=collection,
|
||
|
|
scroll_filter=Filter(must=[
|
||
|
|
FieldCondition(key="doc_hash", match=MatchValue(value=doc_hash))
|
||
|
|
]),
|
||
|
|
limit=1000,
|
||
|
|
with_payload=False,
|
||
|
|
)
|
||
|
|
point_ids = [p.id for p in hits[0]]
|
||
|
|
|
||
|
|
if point_ids:
|
||
|
|
qdrant.set_payload(
|
||
|
|
collection_name=collection,
|
||
|
|
payload={"download_url": new_url, "filename": new_filename},
|
||
|
|
points=point_ids,
|
||
|
|
)
|
||
|
|
logger.info(f" Synced {len(point_ids)} vectors for {new_filename} -> {new_url}")
|
||
|
|
|
||
|
|
db.clear_path_update(doc_hash)
|
||
|
|
synced += 1
|
||
|
|
except Exception as e:
|
||
|
|
logger.warning(f" Failed to sync Qdrant paths for {doc_hash}: {e}")
|
||
|
|
|
||
|
|
if synced:
|
||
|
|
logger.info(f"[scanner] Synced {synced} document paths to Qdrant")
|
||
|
|
return synced
|
||
|
|
|
||
|
|
|
||
|
|
# ── CLI command wrappers ─────────────────────────────────────────────────
|
||
|
|
|
||
|
|
def cmd_scan(args):
|
||
|
|
count = scan_library(path=args.path)
|
||
|
|
synced = sync_qdrant_paths()
|
||
|
|
print(f"Scanned {count} PDFs into catalogue, {synced} paths synced to Qdrant")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_queue(args):
|
||
|
|
db = StatusDB()
|
||
|
|
|
||
|
|
if args.hash:
|
||
|
|
if db.queue_document(args.hash):
|
||
|
|
print(f"Queued: {args.hash}")
|
||
|
|
else:
|
||
|
|
print(f"Not found in catalogue: {args.hash}")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
items = db.get_catalogued(
|
||
|
|
source=args.source,
|
||
|
|
category=args.category,
|
||
|
|
limit=args.limit
|
||
|
|
)
|
||
|
|
|
||
|
|
if not items:
|
||
|
|
print("No catalogued items match criteria")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
queued = 0
|
||
|
|
for item in items:
|
||
|
|
if db.queue_document(item['hash']):
|
||
|
|
queued += 1
|
||
|
|
|
||
|
|
print(f"Queued {queued} documents for processing")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_extract(args):
|
||
|
|
from lib.extractor import run_extraction
|
||
|
|
success = run_extraction(workers=args.workers)
|
||
|
|
print(f"Extraction complete: {success} documents processed")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_enrich(args):
|
||
|
|
from lib.enricher import run_enrichment
|
||
|
|
success = run_enrichment(workers=args.workers, limit=args.limit)
|
||
|
|
print(f"Enrichment complete: {success} documents processed")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_embed(args):
|
||
|
|
from lib.embedder import run_embedding
|
||
|
|
success = run_embedding(workers=args.workers, limit=args.limit)
|
||
|
|
print(f"Embedding complete: {success} documents processed")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_run(args):
|
||
|
|
"""Run all pipeline stages concurrently.
|
||
|
|
|
||
|
|
Each stage runs in its own thread, polling for work independently:
|
||
|
|
- Extract: queued -> extracted
|
||
|
|
- Enrich: extracted -> enriched
|
||
|
|
- Embed: enriched -> complete
|
||
|
|
|
||
|
|
Documents flow through continuously without waiting for a stage to finish.
|
||
|
|
"""
|
||
|
|
from lib.extractor import run_extraction
|
||
|
|
from lib.enricher import run_enrichment
|
||
|
|
from lib.embedder import run_embedding
|
||
|
|
|
||
|
|
config = get_config()
|
||
|
|
extract_workers = args.workers
|
||
|
|
enrich_workers = args.enrich_workers or config.get('processing', {}).get('enrich_workers', 16)
|
||
|
|
embed_workers = args.workers
|
||
|
|
poll_interval = 30
|
||
|
|
|
||
|
|
stop_event = threading.Event()
|
||
|
|
totals = {'extract': 0, 'enrich': 0, 'embed': 0}
|
||
|
|
|
||
|
|
def _doc_counts():
|
||
|
|
db = StatusDB()
|
||
|
|
raw = db.get_status_counts()
|
||
|
|
return raw.get('documents', {})
|
||
|
|
|
||
|
|
def _upstream_done(stage, counts):
|
||
|
|
"""Check if all upstream stages are finished feeding this stage."""
|
||
|
|
if stage == 'extract':
|
||
|
|
return counts.get('queued', 0) == 0
|
||
|
|
elif stage == 'enrich':
|
||
|
|
return (counts.get('queued', 0) == 0
|
||
|
|
and counts.get('extracting', 0) == 0
|
||
|
|
and counts.get('extracted', 0) == 0)
|
||
|
|
elif stage == 'embed':
|
||
|
|
return (counts.get('queued', 0) == 0
|
||
|
|
and counts.get('extracting', 0) == 0
|
||
|
|
and counts.get('extracted', 0) == 0
|
||
|
|
and counts.get('enriching', 0) == 0
|
||
|
|
and counts.get('enriched', 0) == 0)
|
||
|
|
return False
|
||
|
|
|
||
|
|
def stage_loop(name, process_fn):
|
||
|
|
"""Run a stage: process available work, sleep, repeat until done."""
|
||
|
|
logger.info(f"[{name}] Stage started")
|
||
|
|
idle_cycles = 0
|
||
|
|
|
||
|
|
while not stop_event.is_set():
|
||
|
|
try:
|
||
|
|
processed = process_fn()
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"[{name}] Error: {e}")
|
||
|
|
processed = 0
|
||
|
|
|
||
|
|
if processed and processed > 0:
|
||
|
|
totals[name] += processed
|
||
|
|
idle_cycles = 0
|
||
|
|
logger.info(f"[{name}] Batch done: {processed} docs (total: {totals[name]})")
|
||
|
|
continue # immediately check for more
|
||
|
|
|
||
|
|
idle_cycles += 1
|
||
|
|
|
||
|
|
# After 2 idle polls, check if upstream is finished
|
||
|
|
if idle_cycles >= 2:
|
||
|
|
counts = _doc_counts()
|
||
|
|
if _upstream_done(name, counts):
|
||
|
|
logger.info(f"[{name}] No upstream work remaining, exiting "
|
||
|
|
f"(total: {totals[name]})")
|
||
|
|
break
|
||
|
|
|
||
|
|
stop_event.wait(poll_interval)
|
||
|
|
|
||
|
|
logger.info(f"[{name}] Stage finished — {totals[name]} documents processed")
|
||
|
|
|
||
|
|
threads = [
|
||
|
|
threading.Thread(
|
||
|
|
target=stage_loop, daemon=True, name='extract',
|
||
|
|
args=('extract', lambda: run_extraction(workers=extract_workers)),
|
||
|
|
),
|
||
|
|
threading.Thread(
|
||
|
|
target=stage_loop, daemon=True, name='enrich',
|
||
|
|
args=('enrich', lambda: run_enrichment(workers=enrich_workers)),
|
||
|
|
),
|
||
|
|
threading.Thread(
|
||
|
|
target=stage_loop, daemon=True, name='embed',
|
||
|
|
args=('embed', lambda: run_embedding(workers=embed_workers)),
|
||
|
|
),
|
||
|
|
]
|
||
|
|
|
||
|
|
logger.info("=== RECON Pipeline Starting (concurrent) ===")
|
||
|
|
logger.info(f" Extract: {extract_workers} workers | "
|
||
|
|
f"Enrich: {enrich_workers} workers | "
|
||
|
|
f"Embed: {embed_workers} workers")
|
||
|
|
|
||
|
|
for t in threads:
|
||
|
|
t.start()
|
||
|
|
|
||
|
|
try:
|
||
|
|
while any(t.is_alive() for t in threads):
|
||
|
|
time.sleep(60)
|
||
|
|
c = _doc_counts()
|
||
|
|
logger.info(
|
||
|
|
f"[pipeline] queued={c.get('queued', 0)} "
|
||
|
|
f"extracting={c.get('extracting', 0)} "
|
||
|
|
f"extracted={c.get('extracted', 0)} "
|
||
|
|
f"enriching={c.get('enriching', 0)} "
|
||
|
|
f"enriched={c.get('enriched', 0)} "
|
||
|
|
f"embedding={c.get('embedding', 0)} "
|
||
|
|
f"complete={c.get('complete', 0)} "
|
||
|
|
f"failed={c.get('failed', 0)}"
|
||
|
|
)
|
||
|
|
except KeyboardInterrupt:
|
||
|
|
logger.info("Pipeline interrupted, stopping stages...")
|
||
|
|
stop_event.set()
|
||
|
|
for t in threads:
|
||
|
|
t.join(timeout=30)
|
||
|
|
|
||
|
|
logger.info(f"=== RECON Pipeline Complete: "
|
||
|
|
f"{totals['extract']} extracted, "
|
||
|
|
f"{totals['enrich']} enriched, "
|
||
|
|
f"{totals['embed']} embedded ===")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_status(args):
|
||
|
|
db = StatusDB()
|
||
|
|
counts = db.get_status_counts()
|
||
|
|
|
||
|
|
print("=== RECON Status ===\n")
|
||
|
|
|
||
|
|
cat = counts.get('catalogue', {})
|
||
|
|
print("Catalogue:")
|
||
|
|
for status, count in sorted(cat.items()):
|
||
|
|
print(f" {status}: {count}")
|
||
|
|
print(f" TOTAL: {sum(cat.values())}")
|
||
|
|
|
||
|
|
doc = counts.get('documents', {})
|
||
|
|
print("\nPipeline:")
|
||
|
|
for status in ['queued', 'extracting', 'extracted', 'enriching', 'enriched', 'embedding', 'complete', 'failed']:
|
||
|
|
count = doc.get(status, 0)
|
||
|
|
if count > 0:
|
||
|
|
print(f" {status}: {count}")
|
||
|
|
print(f" TOTAL: {sum(doc.values())}")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_catalogue(args):
|
||
|
|
db = StatusDB()
|
||
|
|
|
||
|
|
if args.sources:
|
||
|
|
sources = db.source_breakdown()
|
||
|
|
print(f"{'Source':<30} {'Count':>6} {'Size (MB)':>10}")
|
||
|
|
print("-" * 50)
|
||
|
|
for s in sources:
|
||
|
|
size_mb = (s.get('total_bytes', 0) or 0) / (1024 * 1024)
|
||
|
|
print(f"{s['source']:<30} {s['count']:>6} {size_mb:>10.1f}")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
if args.categories:
|
||
|
|
cats = db.category_breakdown(source=args.source)
|
||
|
|
for c in cats:
|
||
|
|
src = c.get('source', args.source or '')
|
||
|
|
print(f" {src}/{c['category']}: {c['count']}")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
items = db.get_catalogued(
|
||
|
|
source=args.source,
|
||
|
|
category=args.category,
|
||
|
|
limit=args.limit or 50
|
||
|
|
)
|
||
|
|
|
||
|
|
for item in items:
|
||
|
|
size_mb = (item.get('size_bytes', 0) or 0) / (1024 * 1024)
|
||
|
|
print(f" [{item['hash'][:8]}] {item['filename']:<60} {size_mb:>8.1f} MB {item['source']}/{item['category']}")
|
||
|
|
|
||
|
|
print(f"\nShowing {len(items)} items")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_failures(args):
|
||
|
|
db = StatusDB()
|
||
|
|
failures = db.get_failures()
|
||
|
|
|
||
|
|
if not failures:
|
||
|
|
print("No failures")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
for f in failures:
|
||
|
|
print(f" [{f['hash'][:8]}] {f['filename']}")
|
||
|
|
print(f" Error: {f.get('error_message', 'unknown')}")
|
||
|
|
print(f" Retries: {f.get('retry_count', 0)}")
|
||
|
|
print()
|
||
|
|
|
||
|
|
print(f"Total failures: {len(failures)}")
|
||
|
|
|
||
|
|
if args.retry:
|
||
|
|
for f in failures:
|
||
|
|
db.increment_retry(f['hash'])
|
||
|
|
print(f"Re-queued {len(failures)} documents")
|
||
|
|
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_search(args):
|
||
|
|
from qdrant_client import QdrantClient
|
||
|
|
from lib.embedder import get_embedding_single
|
||
|
|
|
||
|
|
config = get_config()
|
||
|
|
query = ' '.join(args.query)
|
||
|
|
query_vector = get_embedding_single(query, config)
|
||
|
|
|
||
|
|
qdrant = QdrantClient(
|
||
|
|
host=config['vector_db']['host'],
|
||
|
|
port=config['vector_db']['port'],
|
||
|
|
timeout=60
|
||
|
|
)
|
||
|
|
|
||
|
|
response = qdrant.query_points(
|
||
|
|
collection_name=config['vector_db']['collection'],
|
||
|
|
query=query_vector,
|
||
|
|
limit=args.limit
|
||
|
|
)
|
||
|
|
results = response.points
|
||
|
|
|
||
|
|
if not results:
|
||
|
|
print("No results found")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
for i, r in enumerate(results, 1):
|
||
|
|
p = r.payload
|
||
|
|
print(f"\n--- Result {i} (score: {r.score:.4f}) ---")
|
||
|
|
print(f" Title: {p.get('title', 'Untitled')}")
|
||
|
|
print(f" Book: {p.get('book_title', p.get('filename', '?'))}")
|
||
|
|
print(f" Type: {p.get('source_type', 'document')}")
|
||
|
|
summary = p.get('summary', '')
|
||
|
|
if summary:
|
||
|
|
print(f" Summary: {summary[:200]}")
|
||
|
|
domains = p.get('domain', [])
|
||
|
|
if domains:
|
||
|
|
print(f" Domains: {', '.join(domains) if isinstance(domains, list) else domains}")
|
||
|
|
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_upload(args):
|
||
|
|
config = get_config()
|
||
|
|
db = StatusDB()
|
||
|
|
library_root = config['library_root']
|
||
|
|
upload_paths = config.get('upload_paths', {})
|
||
|
|
category = args.category or ''
|
||
|
|
|
||
|
|
def _resolve_path(cat):
|
||
|
|
if cat in upload_paths:
|
||
|
|
return upload_paths[cat]
|
||
|
|
default_path = upload_paths.get('default', library_root)
|
||
|
|
if cat:
|
||
|
|
from werkzeug.utils import secure_filename
|
||
|
|
safe = secure_filename(cat)
|
||
|
|
if safe:
|
||
|
|
return os.path.join(default_path, safe)
|
||
|
|
return default_path
|
||
|
|
|
||
|
|
def _upload_one(filepath):
|
||
|
|
filename = os.path.basename(filepath)
|
||
|
|
if not filename.lower().endswith('.pdf'):
|
||
|
|
print(f" SKIP (not PDF): {filename}")
|
||
|
|
return False
|
||
|
|
|
||
|
|
file_hash = content_hash(filepath)
|
||
|
|
|
||
|
|
# Check duplicate
|
||
|
|
conn = db._get_conn()
|
||
|
|
existing = conn.execute("SELECT filename FROM catalogue WHERE hash = ?", (file_hash,)).fetchone()
|
||
|
|
if existing:
|
||
|
|
print(f" DUPLICATE: {filename} (matches {existing['filename']})")
|
||
|
|
return False
|
||
|
|
|
||
|
|
target_dir = _resolve_path(category)
|
||
|
|
os.makedirs(target_dir, exist_ok=True)
|
||
|
|
|
||
|
|
dest = os.path.join(target_dir, filename)
|
||
|
|
if os.path.exists(dest):
|
||
|
|
base, ext = os.path.splitext(filename)
|
||
|
|
dest = os.path.join(target_dir, f"{base}_{file_hash[:8]}{ext}")
|
||
|
|
|
||
|
|
shutil.copy2(filepath, dest)
|
||
|
|
size = os.path.getsize(dest)
|
||
|
|
source, derived_cat = derive_source_and_category(dest, library_root)
|
||
|
|
|
||
|
|
db.add_to_catalogue(file_hash, filename, dest, size, source, derived_cat)
|
||
|
|
db.queue_document(file_hash)
|
||
|
|
|
||
|
|
print(f" QUEUED: {filename} -> {source}/{derived_cat} [{file_hash[:8]}]")
|
||
|
|
return True
|
||
|
|
|
||
|
|
uploaded = 0
|
||
|
|
|
||
|
|
if args.file:
|
||
|
|
if not os.path.isfile(args.file):
|
||
|
|
print(f"File not found: {args.file}")
|
||
|
|
return 1
|
||
|
|
if _upload_one(args.file):
|
||
|
|
uploaded += 1
|
||
|
|
|
||
|
|
elif args.dir:
|
||
|
|
if not os.path.isdir(args.dir):
|
||
|
|
print(f"Directory not found: {args.dir}")
|
||
|
|
return 1
|
||
|
|
for fname in sorted(os.listdir(args.dir)):
|
||
|
|
fpath = os.path.join(args.dir, fname)
|
||
|
|
if os.path.isfile(fpath) and fname.lower().endswith('.pdf'):
|
||
|
|
if _upload_one(fpath):
|
||
|
|
uploaded += 1
|
||
|
|
else:
|
||
|
|
print("Specify --file or --dir")
|
||
|
|
return 1
|
||
|
|
|
||
|
|
print(f"\nUploaded {uploaded} PDF(s)")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_ingest_url(args):
|
||
|
|
from lib.web_scraper import ingest_url, ingest_urls
|
||
|
|
|
||
|
|
urls = []
|
||
|
|
|
||
|
|
if args.url:
|
||
|
|
urls.append(args.url)
|
||
|
|
|
||
|
|
if args.file:
|
||
|
|
with open(args.file) as f:
|
||
|
|
urls.extend([line.strip() for line in f if line.strip() and not line.startswith('#')])
|
||
|
|
|
||
|
|
if not urls:
|
||
|
|
print("Error: Provide a URL argument or --file with URLs")
|
||
|
|
return 1
|
||
|
|
|
||
|
|
print(f"Ingesting {len(urls)} URL(s) into category '{args.category}'...")
|
||
|
|
|
||
|
|
if len(urls) == 1:
|
||
|
|
try:
|
||
|
|
result = ingest_url(urls[0], category=args.category, source=args.source)
|
||
|
|
status = result.get('status', 'unknown').upper()
|
||
|
|
print(f" {status}: {result.get('title', 'Untitled')}")
|
||
|
|
print(f" Hash: {result['hash'][:16]}...")
|
||
|
|
if result.get('page_count'):
|
||
|
|
print(f" Pages: {result['page_count']}")
|
||
|
|
if result.get('existing_status'):
|
||
|
|
print(f" Existing status: {result['existing_status']}")
|
||
|
|
except Exception as e:
|
||
|
|
print(f" FAILED: {e}")
|
||
|
|
return 1
|
||
|
|
else:
|
||
|
|
results = ingest_urls(urls, category=args.category, source=args.source, delay=args.delay)
|
||
|
|
for r in results:
|
||
|
|
status = r.get('status', 'unknown').upper()
|
||
|
|
title = r.get('title', r.get('url', 'Unknown'))
|
||
|
|
print(f" {status}: {title}")
|
||
|
|
|
||
|
|
succeeded = sum(1 for r in results if r['status'] not in ('failed', 'duplicate'))
|
||
|
|
dupes = sum(1 for r in results if r.get('status') == 'duplicate')
|
||
|
|
failed = sum(1 for r in results if r.get('status') == 'failed')
|
||
|
|
print(f"\nTotal: {succeeded} new, {dupes} duplicates, {failed} failed")
|
||
|
|
|
||
|
|
# Optional: run enrichment
|
||
|
|
if args.enrich or args.process:
|
||
|
|
print("\nRunning enrichment on extracted content...")
|
||
|
|
from lib.enricher import run_enrichment
|
||
|
|
enriched = run_enrichment()
|
||
|
|
print(f" Enriched: {enriched}")
|
||
|
|
|
||
|
|
# Optional: run embedding too
|
||
|
|
if args.process:
|
||
|
|
print("\nRunning embedding...")
|
||
|
|
from lib.embedder import run_embedding
|
||
|
|
embedded = run_embedding()
|
||
|
|
print(f" Embedded: {embedded}")
|
||
|
|
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_crawl(args):
|
||
|
|
from lib.crawler import crawl_site
|
||
|
|
|
||
|
|
print(f"Crawling {args.url}...")
|
||
|
|
if args.include:
|
||
|
|
print(f" Include paths: {args.include}")
|
||
|
|
if args.exclude:
|
||
|
|
print(f" Exclude paths: {args.exclude}")
|
||
|
|
if args.dry_run:
|
||
|
|
print(f" DRY RUN — no content will be ingested")
|
||
|
|
print()
|
||
|
|
|
||
|
|
result = crawl_site(
|
||
|
|
base_url=args.url,
|
||
|
|
category=args.category,
|
||
|
|
source=args.source,
|
||
|
|
include=args.include,
|
||
|
|
exclude=args.exclude,
|
||
|
|
max_pages=args.max_pages,
|
||
|
|
max_depth=args.max_depth,
|
||
|
|
delay=args.delay,
|
||
|
|
dry_run=args.dry_run,
|
||
|
|
use_sitemap=not args.no_sitemap,
|
||
|
|
)
|
||
|
|
|
||
|
|
method = result.get('discovery_method', 'none')
|
||
|
|
print(f"Discovery method: {method}")
|
||
|
|
|
||
|
|
if args.dry_run:
|
||
|
|
urls = result.get('urls', [])
|
||
|
|
print(f"Found {len(urls)} URLs that would be ingested:\n")
|
||
|
|
for i, url in enumerate(urls, 1):
|
||
|
|
print(f" {i:4d}. {url}")
|
||
|
|
print(f"\nTotal: {len(urls)} pages")
|
||
|
|
print(f"Re-run without --dry-run to ingest.")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
summary = result.get('summary', {})
|
||
|
|
print(f"\nResults:")
|
||
|
|
print(f" New: {summary.get('succeeded', 0)}")
|
||
|
|
print(f" Duplicates: {summary.get('duplicates', 0)}")
|
||
|
|
print(f" Failed: {summary.get('failed', 0)}")
|
||
|
|
print(f" Total: {summary.get('total', 0)}")
|
||
|
|
|
||
|
|
failed_results = [r for r in result.get('results', []) if r.get('status') == 'failed']
|
||
|
|
if failed_results:
|
||
|
|
print(f"\nFailed URLs:")
|
||
|
|
for r in failed_results[:10]:
|
||
|
|
print(f" {r['url']}: {r.get('error', 'Unknown error')}")
|
||
|
|
if len(failed_results) > 10:
|
||
|
|
print(f" ... and {len(failed_results) - 10} more")
|
||
|
|
|
||
|
|
if args.enrich or args.process:
|
||
|
|
print("\nRunning enrichment...")
|
||
|
|
from lib.enricher import run_enrichment
|
||
|
|
enriched = run_enrichment()
|
||
|
|
print(f" Enriched: {enriched}")
|
||
|
|
|
||
|
|
if args.process:
|
||
|
|
print("\nRunning embedding...")
|
||
|
|
from lib.embedder import run_embedding
|
||
|
|
embedded = run_embedding()
|
||
|
|
print(f" Embedded: {embedded}")
|
||
|
|
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_validate(args):
|
||
|
|
from scripts.validate import run_validation
|
||
|
|
run_validation(deep=args.deep)
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_rebuild(args):
|
||
|
|
from scripts.rebuild_qdrant import run_rebuild
|
||
|
|
run_rebuild()
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_serve(args):
|
||
|
|
from lib.api import run_server
|
||
|
|
run_server()
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_service(args):
|
||
|
|
"""Run RECON as a long-lived service. Called by systemd.
|
||
|
|
|
||
|
|
Bundles: Flask dashboard + pipeline stages + library scanner + PeerTube scanner + progress reporter.
|
||
|
|
All threads are daemon threads; SIGTERM/SIGINT trigger graceful shutdown.
|
||
|
|
"""
|
||
|
|
from lib.extractor import run_extraction
|
||
|
|
from lib.enricher import run_enrichment
|
||
|
|
from lib.embedder import run_embedding
|
||
|
|
from lib.api import app, run_server as start_dashboard
|
||
|
|
from lib.peertube_scraper import ingest_all as pt_ingest_all
|
||
|
|
|
||
|
|
config = get_config()
|
||
|
|
proc = config.get('processing', {})
|
||
|
|
svc = config.get('service', {})
|
||
|
|
|
||
|
|
extract_workers = proc.get('extract_workers', 4)
|
||
|
|
enrich_workers = proc.get('enrich_workers', 16)
|
||
|
|
embed_workers = proc.get('embed_workers', 4)
|
||
|
|
poll_interval = svc.get('stage_poll_interval', 30)
|
||
|
|
scan_interval = svc.get('scan_interval', 3600)
|
||
|
|
progress_interval = svc.get('progress_interval', 60)
|
||
|
|
web_host = config.get('web', {}).get('host', '0.0.0.0')
|
||
|
|
web_port = config.get('web', {}).get('port', 8420)
|
||
|
|
|
||
|
|
stop_event = threading.Event()
|
||
|
|
totals = {'extract': 0, 'enrich': 0, 'embed': 0, 'scan': 0}
|
||
|
|
|
||
|
|
def shutdown(signum, frame):
|
||
|
|
sig_name = signal.Signals(signum).name
|
||
|
|
logger.info(f"Received {sig_name}, shutting down gracefully...")
|
||
|
|
stop_event.set()
|
||
|
|
|
||
|
|
signal.signal(signal.SIGTERM, shutdown)
|
||
|
|
signal.signal(signal.SIGINT, shutdown)
|
||
|
|
|
||
|
|
def stage_loop(name, process_fn):
|
||
|
|
"""Run a pipeline stage: process batch, sleep, repeat forever."""
|
||
|
|
logger.info(f"[{name}] Stage started")
|
||
|
|
while not stop_event.is_set():
|
||
|
|
try:
|
||
|
|
processed = process_fn()
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"[{name}] Error: {e}", exc_info=True)
|
||
|
|
processed = 0
|
||
|
|
if processed and processed > 0:
|
||
|
|
totals[name] += processed
|
||
|
|
continue
|
||
|
|
stop_event.wait(poll_interval)
|
||
|
|
logger.info(f"[{name}] Stage stopped (total: {totals[name]})")
|
||
|
|
|
||
|
|
def scanner_loop():
|
||
|
|
"""Periodically scan library and queue new documents."""
|
||
|
|
logger.info(f"[scanner] Started (interval: {scan_interval}s)")
|
||
|
|
# Run initial scan immediately
|
||
|
|
try:
|
||
|
|
new_cat = scan_library()
|
||
|
|
new_queued = queue_all()
|
||
|
|
synced = sync_qdrant_paths()
|
||
|
|
if new_cat or new_queued or synced:
|
||
|
|
logger.info(f"[scanner] Initial: {new_cat} catalogued, {new_queued} queued, {synced} paths synced")
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"[scanner] Initial scan error: {e}", exc_info=True)
|
||
|
|
|
||
|
|
while not stop_event.is_set():
|
||
|
|
stop_event.wait(scan_interval)
|
||
|
|
if stop_event.is_set():
|
||
|
|
break
|
||
|
|
try:
|
||
|
|
new_cat = scan_library()
|
||
|
|
new_queued = queue_all()
|
||
|
|
synced = sync_qdrant_paths()
|
||
|
|
totals['scan'] += new_queued
|
||
|
|
if new_cat or new_queued or synced:
|
||
|
|
logger.info(f"[scanner] {new_cat} catalogued, {new_queued} queued, {synced} paths synced")
|
||
|
|
else:
|
||
|
|
logger.debug("[scanner] No new documents")
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"[scanner] Error: {e}", exc_info=True)
|
||
|
|
logger.info("[scanner] Stopped")
|
||
|
|
|
||
|
|
def progress_loop():
|
||
|
|
"""Log pipeline status periodically."""
|
||
|
|
while not stop_event.is_set():
|
||
|
|
stop_event.wait(progress_interval)
|
||
|
|
if stop_event.is_set():
|
||
|
|
break
|
||
|
|
try:
|
||
|
|
db = StatusDB()
|
||
|
|
raw = db.get_status_counts()
|
||
|
|
c = raw.get('documents', {})
|
||
|
|
logger.info(
|
||
|
|
f"[pipeline] queued={c.get('queued', 0)} "
|
||
|
|
f"extracting={c.get('extracting', 0)} "
|
||
|
|
f"extracted={c.get('extracted', 0)} "
|
||
|
|
f"enriching={c.get('enriching', 0)} "
|
||
|
|
f"enriched={c.get('enriched', 0)} "
|
||
|
|
f"embedding={c.get('embedding', 0)} "
|
||
|
|
f"complete={c.get('complete', 0)} "
|
||
|
|
f"failed={c.get('failed', 0)}"
|
||
|
|
)
|
||
|
|
except Exception:
|
||
|
|
pass
|
||
|
|
|
||
|
|
def peertube_scanner_loop():
|
||
|
|
"""Periodically ingest new PeerTube video transcripts."""
|
||
|
|
from lib.peertube_scraper import set_stop_check
|
||
|
|
set_stop_check(stop_event.is_set)
|
||
|
|
logger.info(f"[peertube] Scanner started (interval: {scan_interval}s)")
|
||
|
|
try:
|
||
|
|
result = pt_ingest_all(config=config)
|
||
|
|
ingested = result.get('ingested', 0) if isinstance(result, dict) else 0
|
||
|
|
if ingested:
|
||
|
|
logger.info(f"[peertube] Initial scan: {ingested} transcripts ingested")
|
||
|
|
else:
|
||
|
|
logger.info("[peertube] Initial scan: no new transcripts")
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"[peertube] Initial scan error: {e}", exc_info=True)
|
||
|
|
|
||
|
|
while not stop_event.is_set():
|
||
|
|
stop_event.wait(scan_interval)
|
||
|
|
if stop_event.is_set():
|
||
|
|
break
|
||
|
|
try:
|
||
|
|
result = pt_ingest_all(config=config)
|
||
|
|
ingested = result.get('ingested', 0) if isinstance(result, dict) else 0
|
||
|
|
if ingested:
|
||
|
|
logger.info(f"[peertube] {ingested} new transcripts ingested")
|
||
|
|
else:
|
||
|
|
logger.debug("[peertube] No new transcripts")
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"[peertube] Scan error: {e}", exc_info=True)
|
||
|
|
logger.info("[peertube] Scanner stopped")
|
||
|
|
|
||
|
|
def crawler_scheduler_loop():
|
||
|
|
"""Scheduled site crawler — crawls configured sites, one per cycle.
|
||
|
|
|
||
|
|
- Reads sites from config.yaml crawler.sites list
|
||
|
|
- Processes sites in tier order (lower tier = higher priority)
|
||
|
|
- Tracks last-crawled timestamp per site in crawl_state.json
|
||
|
|
- Skips sites crawled within recrawl_interval_days (default 7)
|
||
|
|
- Waits inter_site_cooldown seconds between sites (default 30)
|
||
|
|
- Uses per-site delay for rate limiting during crawl
|
||
|
|
"""
|
||
|
|
import json as _json
|
||
|
|
from lib.crawler import crawl_site
|
||
|
|
|
||
|
|
crawler_cfg = config.get('crawler', {})
|
||
|
|
sites = crawler_cfg.get('sites', [])
|
||
|
|
if not sites:
|
||
|
|
logger.info("[crawler] No sites configured, scheduler disabled")
|
||
|
|
return
|
||
|
|
|
||
|
|
recrawl_days = crawler_cfg.get('recrawl_interval_days', 7)
|
||
|
|
cooldown = crawler_cfg.get('inter_site_cooldown', 30)
|
||
|
|
state_file = os.path.join(config.get('paths', {}).get('data', '/opt/recon/data'), 'crawl_state.json')
|
||
|
|
|
||
|
|
# Load persisted state
|
||
|
|
def load_state():
|
||
|
|
try:
|
||
|
|
with open(state_file, 'r') as sf:
|
||
|
|
return _json.load(sf)
|
||
|
|
except (FileNotFoundError, _json.JSONDecodeError):
|
||
|
|
return {}
|
||
|
|
|
||
|
|
def save_state(state):
|
||
|
|
try:
|
||
|
|
with open(state_file, 'w') as sf:
|
||
|
|
_json.dump(state, sf, indent=2)
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"[crawler] Failed to save state: {e}")
|
||
|
|
|
||
|
|
# Sort by tier (lower = higher priority)
|
||
|
|
sorted_sites = sorted(sites, key=lambda s: s.get('tier', 99))
|
||
|
|
logger.info(f"[crawler] Scheduler started — {len(sorted_sites)} sites configured, "
|
||
|
|
f"recrawl every {recrawl_days}d, {cooldown}s cooldown between sites")
|
||
|
|
|
||
|
|
# Initial delay — let the main pipeline stabilize before crawling
|
||
|
|
stop_event.wait(60)
|
||
|
|
if stop_event.is_set():
|
||
|
|
return
|
||
|
|
|
||
|
|
while not stop_event.is_set():
|
||
|
|
state = load_state()
|
||
|
|
now = time.time()
|
||
|
|
crawled_this_cycle = 0
|
||
|
|
|
||
|
|
for site in sorted_sites:
|
||
|
|
if stop_event.is_set():
|
||
|
|
break
|
||
|
|
|
||
|
|
url = site.get('url', '')
|
||
|
|
if not url:
|
||
|
|
continue
|
||
|
|
|
||
|
|
# Check recrawl interval
|
||
|
|
last_crawled = state.get(url, {}).get('last_crawled', 0)
|
||
|
|
age_days = (now - last_crawled) / 86400
|
||
|
|
if age_days < recrawl_days:
|
||
|
|
logger.debug(f"[crawler] Skipping {url} — crawled {age_days:.1f}d ago")
|
||
|
|
continue
|
||
|
|
|
||
|
|
category = site.get('category', 'Web')
|
||
|
|
max_depth = site.get('max_depth', crawler_cfg.get('max_depth', 3))
|
||
|
|
max_pages = site.get('max_pages', crawler_cfg.get('max_pages', 500))
|
||
|
|
delay = site.get('delay', crawler_cfg.get('rate_limit_delay', 1.0))
|
||
|
|
tier = site.get('tier', 99)
|
||
|
|
notes = site.get('notes', '')
|
||
|
|
|
||
|
|
logger.info(f"[crawler] Starting: {url} (tier {tier}, category={category}, "
|
||
|
|
f"depth={max_depth}, pages={max_pages}, delay={delay}s)")
|
||
|
|
|
||
|
|
try:
|
||
|
|
result = crawl_site(
|
||
|
|
base_url=url,
|
||
|
|
category=category,
|
||
|
|
max_pages=max_pages,
|
||
|
|
max_depth=max_depth,
|
||
|
|
delay=delay,
|
||
|
|
config=config,
|
||
|
|
)
|
||
|
|
|
||
|
|
summary = result.get('summary', {})
|
||
|
|
method = result.get('discovery_method', 'none')
|
||
|
|
logger.info(f"[crawler] Done: {url} via {method} — "
|
||
|
|
f"{summary.get('succeeded', 0)} new, "
|
||
|
|
f"{summary.get('duplicates', 0)} dupes, "
|
||
|
|
f"{summary.get('failed', 0)} failed")
|
||
|
|
|
||
|
|
# Update state
|
||
|
|
state[url] = {
|
||
|
|
'last_crawled': time.time(),
|
||
|
|
'method': method,
|
||
|
|
'succeeded': summary.get('succeeded', 0),
|
||
|
|
'duplicates': summary.get('duplicates', 0),
|
||
|
|
'failed': summary.get('failed', 0),
|
||
|
|
'tier': tier,
|
||
|
|
'category': category,
|
||
|
|
}
|
||
|
|
save_state(state)
|
||
|
|
crawled_this_cycle += 1
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"[crawler] Failed: {url} — {e}", exc_info=True)
|
||
|
|
state[url] = {
|
||
|
|
'last_crawled': time.time(),
|
||
|
|
'error': str(e),
|
||
|
|
'tier': tier,
|
||
|
|
'category': category,
|
||
|
|
}
|
||
|
|
save_state(state)
|
||
|
|
|
||
|
|
# Inter-site cooldown
|
||
|
|
if not stop_event.is_set():
|
||
|
|
logger.debug(f"[crawler] Cooling down {cooldown}s before next site...")
|
||
|
|
stop_event.wait(cooldown)
|
||
|
|
|
||
|
|
if crawled_this_cycle:
|
||
|
|
logger.info(f"[crawler] Cycle complete — {crawled_this_cycle} sites crawled")
|
||
|
|
else:
|
||
|
|
logger.debug("[crawler] Cycle complete — all sites within recrawl window")
|
||
|
|
|
||
|
|
# Sleep until next cycle check (1 hour)
|
||
|
|
if not stop_event.is_set():
|
||
|
|
stop_event.wait(3600)
|
||
|
|
|
||
|
|
logger.info("[crawler] Scheduler stopped")
|
||
|
|
|
||
|
|
def organizer_loop():
|
||
|
|
"""Organize completed documents into Domain/Subdomain folders."""
|
||
|
|
from lib.organizer import organize_document
|
||
|
|
logger.info(f"[organizer] Started (interval: {poll_interval}s)")
|
||
|
|
# Initial delay — let pipeline process some docs first
|
||
|
|
stop_event.wait(60)
|
||
|
|
|
||
|
|
while not stop_event.is_set():
|
||
|
|
try:
|
||
|
|
org_db = StatusDB()
|
||
|
|
unorganized = org_db.get_unorganized(limit=100)
|
||
|
|
if unorganized:
|
||
|
|
moved = 0
|
||
|
|
errors = 0
|
||
|
|
for doc in unorganized:
|
||
|
|
if stop_event.is_set():
|
||
|
|
break
|
||
|
|
result = organize_document(doc['hash'], org_db, config)
|
||
|
|
if result['action'] == 'moved':
|
||
|
|
moved += 1
|
||
|
|
elif result['action'] == 'error':
|
||
|
|
errors += 1
|
||
|
|
if moved or errors:
|
||
|
|
logger.info(f"[organizer] Organized {moved} documents ({errors} errors)")
|
||
|
|
# Sync paths to Qdrant after batch
|
||
|
|
if moved > 0:
|
||
|
|
synced = sync_qdrant_paths()
|
||
|
|
if synced:
|
||
|
|
logger.info(f"[organizer] Synced {synced} paths to Qdrant")
|
||
|
|
else:
|
||
|
|
logger.debug("[organizer] No unorganized documents")
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"[organizer] Error: {e}", exc_info=True)
|
||
|
|
stop_event.wait(poll_interval)
|
||
|
|
logger.info("[organizer] Stopped")
|
||
|
|
|
||
|
|
threads = [
|
||
|
|
threading.Thread(target=stage_loop, daemon=True, name='extract',
|
||
|
|
args=('extract', lambda: run_extraction(workers=extract_workers))),
|
||
|
|
threading.Thread(target=stage_loop, daemon=True, name='enrich',
|
||
|
|
args=('enrich', lambda: run_enrichment(workers=enrich_workers))),
|
||
|
|
threading.Thread(target=stage_loop, daemon=True, name='embed',
|
||
|
|
args=('embed', lambda: run_embedding(workers=embed_workers))),
|
||
|
|
threading.Thread(target=scanner_loop, daemon=True, name='scanner'),
|
||
|
|
threading.Thread(target=peertube_scanner_loop, daemon=True, name='peertube'),
|
||
|
|
threading.Thread(target=crawler_scheduler_loop, daemon=True, name='crawler'),
|
||
|
|
threading.Thread(target=organizer_loop, daemon=True, name='organizer'),
|
||
|
|
threading.Thread(target=progress_loop, daemon=True, name='progress'),
|
||
|
|
threading.Thread(target=lambda: start_dashboard(stop_event),
|
||
|
|
daemon=True, name='dashboard'),
|
||
|
|
]
|
||
|
|
|
||
|
|
logger.info("=== RECON Service Starting ===")
|
||
|
|
logger.info(f" Dashboard: {web_host}:{web_port}")
|
||
|
|
logger.info(f" Workers: extract={extract_workers}, enrich={enrich_workers}, embed={embed_workers}")
|
||
|
|
logger.info(f" Scanner: every {scan_interval}s | Progress: every {progress_interval}s")
|
||
|
|
crawler_sites = config.get('crawler', {}).get('sites', [])
|
||
|
|
logger.info(f" Crawler: {len(crawler_sites)} sites configured")
|
||
|
|
|
||
|
|
for t in threads:
|
||
|
|
t.start()
|
||
|
|
|
||
|
|
# Start metrics collector for time-series charts
|
||
|
|
try:
|
||
|
|
from lib.peertube_collector import start_collector
|
||
|
|
start_collector(stop_event)
|
||
|
|
logger.info(" Metrics collector started")
|
||
|
|
except Exception as e:
|
||
|
|
logger.warning(f"Metrics collector failed to start: {e}")
|
||
|
|
|
||
|
|
logger.info("=== RECON Service Ready ===")
|
||
|
|
|
||
|
|
# Block main thread until shutdown signal
|
||
|
|
try:
|
||
|
|
while not stop_event.is_set():
|
||
|
|
stop_event.wait(1)
|
||
|
|
except KeyboardInterrupt:
|
||
|
|
stop_event.set()
|
||
|
|
|
||
|
|
# Give threads a moment to finish current batch
|
||
|
|
logger.info("Waiting for threads to finish...")
|
||
|
|
time.sleep(5)
|
||
|
|
logger.info("=== RECON Service Stopped ===")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_ingest_peertube(args):
|
||
|
|
from lib.peertube_scraper import ingest_channel, ingest_all, get_instance_stats
|
||
|
|
|
||
|
|
if args.stats:
|
||
|
|
stats = get_instance_stats()
|
||
|
|
print("=== PeerTube Instance Stats ===")
|
||
|
|
print(f" Total videos: {stats['total_videos']}")
|
||
|
|
print(f" Ingested into RECON: {stats['ingested']}")
|
||
|
|
if stats.get('status_breakdown'):
|
||
|
|
print(" Pipeline status:")
|
||
|
|
for status, count in sorted(stats['status_breakdown'].items()):
|
||
|
|
print(f" {status}: {count}")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
since = args.since if hasattr(args, 'since') and args.since else None
|
||
|
|
|
||
|
|
if args.channel:
|
||
|
|
print(f"Ingesting PeerTube channel: {args.channel}")
|
||
|
|
result = ingest_channel(args.channel, since=since)
|
||
|
|
else:
|
||
|
|
print("Ingesting all PeerTube videos with captions")
|
||
|
|
result = ingest_all(since=since)
|
||
|
|
|
||
|
|
summary = result.get('summary', {})
|
||
|
|
print(f"\nResults:")
|
||
|
|
print(f" Checked: {summary.get('total_checked', 0)}")
|
||
|
|
print(f" Ingested: {summary.get('ingested', 0)} ({summary.get('total_pages', 0)} pages)")
|
||
|
|
print(f" No captions: {summary.get('skipped_no_captions', 0)}")
|
||
|
|
print(f" Duplicates: {summary.get('skipped_duplicate', 0)}")
|
||
|
|
print(f" Failed: {summary.get('failed', 0)}")
|
||
|
|
|
||
|
|
# Optional: run enrichment
|
||
|
|
if args.enrich or args.process:
|
||
|
|
print("\nRunning enrichment on extracted content...")
|
||
|
|
from lib.enricher import run_enrichment
|
||
|
|
enriched = run_enrichment()
|
||
|
|
print(f" Enriched: {enriched}")
|
||
|
|
|
||
|
|
# Optional: run embedding too
|
||
|
|
if args.process:
|
||
|
|
print("\nRunning embedding...")
|
||
|
|
from lib.embedder import run_embedding
|
||
|
|
embedded = run_embedding()
|
||
|
|
print(f" Embedded: {embedded}")
|
||
|
|
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_organize(args):
|
||
|
|
"""Organize completed documents into Domain/Subdomain folders."""
|
||
|
|
from lib.organizer import organize_document, organize_from_manifest
|
||
|
|
|
||
|
|
config = get_config()
|
||
|
|
db = StatusDB()
|
||
|
|
dry_run = args.dry_run
|
||
|
|
|
||
|
|
if args.manifest:
|
||
|
|
# Bulk migration from manifest
|
||
|
|
print(f"Organizing from manifest: {args.manifest}")
|
||
|
|
if dry_run:
|
||
|
|
print("[DRY RUN] No files will be moved")
|
||
|
|
stats = organize_from_manifest(args.manifest, db, config, dry_run=dry_run)
|
||
|
|
print(f"\nManifest results:")
|
||
|
|
print(f" Total entries: {stats['total']}")
|
||
|
|
print(f" Moved: {stats['moved']}")
|
||
|
|
print(f" Already organized: {stats['already_organized']}")
|
||
|
|
print(f" Skipped (ambig): {stats['skipped']}")
|
||
|
|
print(f" Not found on disk: {stats['not_found']}")
|
||
|
|
print(f" Errors: {stats['errors']}")
|
||
|
|
|
||
|
|
if not dry_run and stats['moved'] > 0:
|
||
|
|
print("\nSyncing paths to Qdrant...")
|
||
|
|
synced = sync_qdrant_paths()
|
||
|
|
print(f" Synced {synced} document paths")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
# Single hash or batch of unorganized docs
|
||
|
|
if args.hash:
|
||
|
|
hashes = [args.hash]
|
||
|
|
else:
|
||
|
|
docs = db.get_unorganized(limit=args.limit)
|
||
|
|
hashes = [d['hash'] for d in docs]
|
||
|
|
if not hashes:
|
||
|
|
print("No unorganized documents found")
|
||
|
|
return 0
|
||
|
|
print(f"Found {len(hashes)} unorganized documents")
|
||
|
|
|
||
|
|
if dry_run:
|
||
|
|
print("[DRY RUN] No files will be moved\n")
|
||
|
|
|
||
|
|
moved = 0
|
||
|
|
skipped = 0
|
||
|
|
errors = 0
|
||
|
|
|
||
|
|
for doc_hash in hashes:
|
||
|
|
result = organize_document(doc_hash, db, config, dry_run=dry_run)
|
||
|
|
action = result['action']
|
||
|
|
|
||
|
|
if action == 'moved' or action == 'would_move':
|
||
|
|
moved += 1
|
||
|
|
if dry_run or args.verbose:
|
||
|
|
print(f" {'WOULD MOVE' if dry_run else 'MOVED'}: {result['before_path']}")
|
||
|
|
print(f" -> {result['after_path']}")
|
||
|
|
print(f" [{result['domain']}/{result['subdomain']}]")
|
||
|
|
elif action == 'already_organized':
|
||
|
|
skipped += 1
|
||
|
|
if args.verbose:
|
||
|
|
print(f" SKIP (already organized): {result['before_path']}")
|
||
|
|
elif action == 'skip_unclassified':
|
||
|
|
skipped += 1
|
||
|
|
if args.verbose:
|
||
|
|
print(f" SKIP (unclassified): {result['before_path']}")
|
||
|
|
elif action == 'error':
|
||
|
|
errors += 1
|
||
|
|
print(f" ERROR: {result.get('before_path', doc_hash[:8])}: {result['error']}")
|
||
|
|
else:
|
||
|
|
skipped += 1
|
||
|
|
|
||
|
|
print(f"\nSummary: {moved} {'would move' if dry_run else 'moved'}, {skipped} skipped, {errors} errors")
|
||
|
|
|
||
|
|
if not dry_run and moved > 0:
|
||
|
|
print("\nSyncing paths to Qdrant...")
|
||
|
|
synced = sync_qdrant_paths()
|
||
|
|
print(f" Synced {synced} document paths")
|
||
|
|
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_ingest(args):
|
||
|
|
from lib.ingester import ingest_file, run_ingestion
|
||
|
|
if args.file:
|
||
|
|
results = ingest_file(args.file)
|
||
|
|
success = sum(1 for r in results if r is not None)
|
||
|
|
print(f"Ingested {success}/{len(results)} items from {args.file}")
|
||
|
|
else:
|
||
|
|
total = run_ingestion(directory=args.directory)
|
||
|
|
print(f"Ingested {total} intel items")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
|
||
|
|
def cmd_pipeline(args):
|
||
|
|
"""Stream B library pipeline: status, migrate, reverse, watch, sweep."""
|
||
|
|
from lib.new_pipeline import (
|
||
|
|
migrate_domain, migrate_civil_org, reverse_operation, run_watchdog, update_qdrant_payload,
|
||
|
|
compute_sweep_plan, execute_sweep_plan, verify_sweep, _save_sweep_plan,
|
||
|
|
)
|
||
|
|
|
||
|
|
config = get_config()
|
||
|
|
db = StatusDB()
|
||
|
|
|
||
|
|
if args.pipeline_action == 'status':
|
||
|
|
stats = db.get_pipeline_stats()
|
||
|
|
print("=== Stream B Pipeline Status ===\n")
|
||
|
|
print("File Operations:")
|
||
|
|
for op, cnt in stats.get('operations', {}).items():
|
||
|
|
print(f" {op}: {cnt}")
|
||
|
|
if not stats.get('operations'):
|
||
|
|
print(" (none)")
|
||
|
|
print("\nDuplicate Review Queue:")
|
||
|
|
for status, cnt in stats.get('duplicates', {}).items():
|
||
|
|
print(f" {status}: {cnt}")
|
||
|
|
if not stats.get('duplicates'):
|
||
|
|
print(" (none)")
|
||
|
|
print(f"\nAcquired pending: {stats.get('acquired_pending', 0)}")
|
||
|
|
print(f"Ingest pending: {stats.get('ingest_pending', 0)}")
|
||
|
|
|
||
|
|
# Recent operations
|
||
|
|
recent = db.get_file_operations(limit=10)
|
||
|
|
if recent:
|
||
|
|
print("\nRecent operations:")
|
||
|
|
for op in recent:
|
||
|
|
print(f" [{op['id']}] {op['operation']} {op['source_filename']} -> {op['target_filename']} "
|
||
|
|
f"(step {op['collision_step']}, {op['qdrant_points_updated']} vectors) "
|
||
|
|
f"at {op['performed_at']}")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
elif args.pipeline_action == 'migrate':
|
||
|
|
dry_run = args.dry_run
|
||
|
|
domain = getattr(args, 'domain', None) or 'Civil Organization'
|
||
|
|
if dry_run:
|
||
|
|
print(f"[DRY RUN] {domain} — No files will be moved\n")
|
||
|
|
else:
|
||
|
|
print(f"=== {domain} Migration ===\n")
|
||
|
|
|
||
|
|
stats = migrate_domain(domain, db, config, dry_run=dry_run)
|
||
|
|
|
||
|
|
print(f"\nResults:")
|
||
|
|
print(f" Total PDFs found: {stats['total']}")
|
||
|
|
print(f" Moved: {stats['moved']}")
|
||
|
|
print(f" Renamed: {stats['renamed']}")
|
||
|
|
print(f" Already correct: {stats['already_correct']}")
|
||
|
|
print(f" Skipped: {stats['skipped']}")
|
||
|
|
print(f" Not catalogued: {stats['not_catalogued']}")
|
||
|
|
print(f" No book_title: {stats['no_book_title']}")
|
||
|
|
print(f" Domain mismatch: {stats['domain_mismatch']}")
|
||
|
|
print(f" Duplicates: {stats['duplicates']}")
|
||
|
|
print(f" Failed: {stats['failed']}")
|
||
|
|
|
||
|
|
if stats.get('errors'):
|
||
|
|
print(f"\nErrors:")
|
||
|
|
for err in stats['errors'][:20]:
|
||
|
|
print(f" {err}")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
elif args.pipeline_action == 'reverse':
|
||
|
|
if not args.operation_id:
|
||
|
|
print("Error: --id required for reverse")
|
||
|
|
return 1
|
||
|
|
op_id = int(args.operation_id)
|
||
|
|
if reverse_operation(op_id, db, config):
|
||
|
|
print(f"Reversed operation {op_id}")
|
||
|
|
else:
|
||
|
|
print(f"Failed to reverse operation {op_id}")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
elif args.pipeline_action == 'watch':
|
||
|
|
print("Starting pipeline watchdog (Ctrl+C to stop)...")
|
||
|
|
run_watchdog(config)
|
||
|
|
return 0
|
||
|
|
|
||
|
|
elif args.pipeline_action == 'sweep':
|
||
|
|
data_dir = config.get('paths', {}).get('data', '/opt/recon/data')
|
||
|
|
output_dir = os.path.join(data_dir, 'sweep')
|
||
|
|
|
||
|
|
if args.verify:
|
||
|
|
print("=== Sweep Verification ===\n")
|
||
|
|
ok, issues, domain_counts = verify_sweep(db, config)
|
||
|
|
print(f"Verified operations: {ok} OK, {len(issues)} issues\n")
|
||
|
|
if issues:
|
||
|
|
print("Issues:")
|
||
|
|
for iss in issues[:50]:
|
||
|
|
print(f" [{iss['type']}] {iss['doc_hash'][:8]}: {iss.get('target_path', iss.get('source_path', '?'))}")
|
||
|
|
if len(issues) > 50:
|
||
|
|
print(f" ... and {len(issues) - 50} more")
|
||
|
|
print("\nDomain distribution (post-sweep):")
|
||
|
|
for dom, cnt in sorted(domain_counts.items(), key=lambda x: -x[1]):
|
||
|
|
print(f" {dom:<35s} {cnt:>6d}")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
if args.execute or args.resume:
|
||
|
|
plan_file = args.plan_file or os.path.join(output_dir, 'sweep_plan.json')
|
||
|
|
if not os.path.exists(plan_file):
|
||
|
|
print(f"Error: No sweep plan found at {plan_file}")
|
||
|
|
print("Run: recon.py pipeline sweep --dry-run first")
|
||
|
|
return 1
|
||
|
|
|
||
|
|
if args.plan_file:
|
||
|
|
checkpoint_file = args.plan_file.replace('.json', '_checkpoint.json') if args.resume else None
|
||
|
|
else:
|
||
|
|
checkpoint_file = os.path.join(output_dir, 'sweep_checkpoint.json') if args.resume else None
|
||
|
|
|
||
|
|
print(f"=== Executing Sweep ===")
|
||
|
|
print(f"Plan: {plan_file}")
|
||
|
|
if checkpoint_file:
|
||
|
|
print(f"Resuming from checkpoint: {checkpoint_file}")
|
||
|
|
print()
|
||
|
|
|
||
|
|
stats = execute_sweep_plan(db, config, plan_file,
|
||
|
|
batch_size=args.batch_size or 500,
|
||
|
|
max_entries=args.max_entries,
|
||
|
|
checkpoint_file=checkpoint_file)
|
||
|
|
|
||
|
|
print(f"\nSweep Results:")
|
||
|
|
print(f" Total entries: {stats['total']}")
|
||
|
|
print(f" Relocated: {stats['relocated']}")
|
||
|
|
print(f" Rescued: {stats['rescued']}")
|
||
|
|
print(f" Unclassified moved: {stats['unclassified_moved']}")
|
||
|
|
print(f" No-op (marked): {stats['no_op_marked']}")
|
||
|
|
print(f" Duplicates: {stats['duplicates']}")
|
||
|
|
print(f" Skipped: {stats['skipped']}")
|
||
|
|
print(f" Failed: {stats['failed']}")
|
||
|
|
print(f" Qdrant updated: {stats['qdrant_updated']}")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
# Default: dry-run
|
||
|
|
print("=== Sweep Dry Run ===\n")
|
||
|
|
print("Computing plan (this may take several minutes)...\n")
|
||
|
|
plan, stats = compute_sweep_plan(db, config)
|
||
|
|
|
||
|
|
plan_file, summary_file = _save_sweep_plan(plan, stats, output_dir)
|
||
|
|
|
||
|
|
print(f"Plan Summary:")
|
||
|
|
print(f" Total files scanned: {stats['total_files']}")
|
||
|
|
print(f" Relocate: {stats['relocate']}")
|
||
|
|
print(f" Rescue (uncataloged): {stats['rescue']}")
|
||
|
|
print(f" Unclassified: {stats['unclassified']}")
|
||
|
|
print(f" No-op (correct): {stats['no_op']}")
|
||
|
|
print(f" Skip in-progress: {stats['skip_in_progress']}")
|
||
|
|
print(f" Skip failed: {stats['skip_failed']}")
|
||
|
|
print(f" Skip garbage title: {stats['skip_garbage']}")
|
||
|
|
print(f" Skip other: {stats['skip_other']}")
|
||
|
|
print(f" Errors: {stats['errors']}")
|
||
|
|
print()
|
||
|
|
print(f"Collision steps:")
|
||
|
|
for step, cnt in sorted(stats['collision_steps'].items()):
|
||
|
|
labels = {1: 'Title.pdf', 2: 'Title_Author.pdf', 3: 'Title_Author_Year.pdf', 4: 'duplicate_review'}
|
||
|
|
print(f" Step {step} ({labels.get(step, '?')}): {cnt}")
|
||
|
|
print()
|
||
|
|
print(f"Plan saved: {plan_file}")
|
||
|
|
print(f"Summary saved: {summary_file}")
|
||
|
|
print()
|
||
|
|
print("Review the plan, then execute with:")
|
||
|
|
print(f" recon.py pipeline sweep --execute")
|
||
|
|
return 0
|
||
|
|
|
||
|
|
else:
|
||
|
|
print("Usage: recon.py pipeline {status|migrate|reverse|watch|sweep}")
|
||
|
|
return 1
|
||
|
|
|
||
|
|
|
||
|
|
def main():
|
||
|
|
parser = argparse.ArgumentParser(
|
||
|
|
description='RECON — Knowledge Base Management System',
|
||
|
|
formatter_class=argparse.RawDescriptionHelpFormatter
|
||
|
|
)
|
||
|
|
sub = parser.add_subparsers(dest='command', help='Available commands')
|
||
|
|
|
||
|
|
# scan
|
||
|
|
p = sub.add_parser('scan', help='Scan library and catalogue PDFs')
|
||
|
|
p.add_argument('--path', help='Specific path to scan (default: library_root)')
|
||
|
|
p.set_defaults(func=cmd_scan)
|
||
|
|
|
||
|
|
# queue
|
||
|
|
p = sub.add_parser('queue', help='Queue catalogued documents for processing')
|
||
|
|
p.add_argument('--hash', help='Queue a specific document by hash')
|
||
|
|
p.add_argument('--source', help='Filter by source')
|
||
|
|
p.add_argument('--category', help='Filter by category')
|
||
|
|
p.add_argument('--limit', type=int, help='Limit number queued')
|
||
|
|
p.set_defaults(func=cmd_queue)
|
||
|
|
|
||
|
|
# extract
|
||
|
|
p = sub.add_parser('extract', help='Extract text from queued PDFs')
|
||
|
|
p.add_argument('--workers', type=int, help='Number of workers')
|
||
|
|
p.set_defaults(func=cmd_extract)
|
||
|
|
|
||
|
|
# enrich
|
||
|
|
p = sub.add_parser('enrich', help='Enrich extracted text with Gemini')
|
||
|
|
p.add_argument('--workers', type=int, help='Number of workers')
|
||
|
|
p.add_argument('--limit', type=int, help='Limit number enriched')
|
||
|
|
p.set_defaults(func=cmd_enrich)
|
||
|
|
|
||
|
|
# embed
|
||
|
|
p = sub.add_parser('embed', help='Embed concepts into Qdrant')
|
||
|
|
p.add_argument('--workers', type=int, help='Number of workers')
|
||
|
|
p.add_argument('--limit', type=int, help='Limit number embedded')
|
||
|
|
p.set_defaults(func=cmd_embed)
|
||
|
|
|
||
|
|
# run
|
||
|
|
p = sub.add_parser('run', help='Run full pipeline (extract -> enrich -> embed)')
|
||
|
|
p.add_argument('--workers', type=int, default=4, help='Number of workers')
|
||
|
|
p.add_argument('--enrich-workers', type=int, help='Override enrich worker count')
|
||
|
|
p.add_argument('--limit', type=int, help='Limit documents per stage')
|
||
|
|
p.set_defaults(func=cmd_run)
|
||
|
|
|
||
|
|
# status
|
||
|
|
p = sub.add_parser('status', help='Show pipeline status')
|
||
|
|
p.set_defaults(func=cmd_status)
|
||
|
|
|
||
|
|
# catalogue
|
||
|
|
p = sub.add_parser('catalogue', help='Browse catalogue')
|
||
|
|
p.add_argument('--sources', action='store_true', help='Show source breakdown')
|
||
|
|
p.add_argument('--categories', action='store_true', help='Show category breakdown')
|
||
|
|
p.add_argument('--source', help='Filter by source')
|
||
|
|
p.add_argument('--category', help='Filter by category')
|
||
|
|
p.add_argument('--limit', type=int, help='Limit results')
|
||
|
|
p.set_defaults(func=cmd_catalogue)
|
||
|
|
|
||
|
|
# failures
|
||
|
|
p = sub.add_parser('failures', help='Show failed documents')
|
||
|
|
p.add_argument('--retry', action='store_true', help='Re-queue all failures')
|
||
|
|
p.set_defaults(func=cmd_failures)
|
||
|
|
|
||
|
|
# search
|
||
|
|
p = sub.add_parser('search', help='Semantic search the knowledge base')
|
||
|
|
p.add_argument('query', nargs='+', help='Search query')
|
||
|
|
p.add_argument('--limit', type=int, default=10, help='Number of results')
|
||
|
|
p.set_defaults(func=cmd_search)
|
||
|
|
|
||
|
|
# upload
|
||
|
|
p = sub.add_parser('upload', help='Upload PDFs to the knowledge base')
|
||
|
|
p.add_argument('--file', help='Upload a single PDF file')
|
||
|
|
p.add_argument('--dir', help='Upload all PDFs from a directory')
|
||
|
|
p.add_argument('--category', help='Category for uploaded files')
|
||
|
|
p.set_defaults(func=cmd_upload)
|
||
|
|
|
||
|
|
# ingest-url
|
||
|
|
p = sub.add_parser('ingest-url', help='Ingest web content from URLs')
|
||
|
|
p.add_argument('url', nargs='?', help='URL to ingest')
|
||
|
|
p.add_argument('--file', help='File containing URLs (one per line)')
|
||
|
|
p.add_argument('--category', default='Web', help='Category for ingested content')
|
||
|
|
p.add_argument('--source', default='web', help='Source identifier')
|
||
|
|
p.add_argument('--delay', type=float, default=1.0, help='Delay between URL fetches (seconds)')
|
||
|
|
p.add_argument('--enrich', action='store_true', help='Run enrichment after ingestion')
|
||
|
|
p.add_argument('--process', action='store_true', help='Full pipeline: ingest + enrich + embed')
|
||
|
|
p.set_defaults(func=cmd_ingest_url)
|
||
|
|
|
||
|
|
# crawl
|
||
|
|
p = sub.add_parser('crawl', help='Crawl a site and ingest discovered pages')
|
||
|
|
p.add_argument('url', help='Base URL to crawl')
|
||
|
|
p.add_argument('--category', default='Web', help='Category for ingested content')
|
||
|
|
p.add_argument('--source', default=None, help='Source identifier (default: domain name)')
|
||
|
|
p.add_argument('--include', nargs='+', help='Only include URL paths starting with these prefixes')
|
||
|
|
p.add_argument('--exclude', nargs='+', help='Exclude URL paths starting with these prefixes')
|
||
|
|
p.add_argument('--max-pages', type=int, default=500, help='Maximum pages to ingest')
|
||
|
|
p.add_argument('--max-depth', type=int, default=3, help='Maximum link-follow depth')
|
||
|
|
p.add_argument('--delay', type=float, default=1.0, help='Delay between page fetches (seconds)')
|
||
|
|
p.add_argument('--dry-run', action='store_true', help='Discover URLs without ingesting')
|
||
|
|
p.add_argument('--no-sitemap', action='store_true', help='Skip sitemap, use link-following only')
|
||
|
|
p.add_argument('--enrich', action='store_true', help='Run enrichment after crawl')
|
||
|
|
p.add_argument('--process', action='store_true', help='Full pipeline: crawl + enrich + embed')
|
||
|
|
p.set_defaults(func=cmd_crawl)
|
||
|
|
|
||
|
|
# validate
|
||
|
|
p = sub.add_parser('validate', help='Validate pipeline consistency')
|
||
|
|
p.add_argument('--deep', action='store_true', help='Deep validation (check all files)')
|
||
|
|
p.set_defaults(func=cmd_validate)
|
||
|
|
|
||
|
|
# rebuild
|
||
|
|
p = sub.add_parser('rebuild', help='Rebuild Qdrant from concept JSONs')
|
||
|
|
p.set_defaults(func=cmd_rebuild)
|
||
|
|
|
||
|
|
# serve
|
||
|
|
p = sub.add_parser('serve', help='Start web dashboard')
|
||
|
|
p.set_defaults(func=cmd_serve)
|
||
|
|
|
||
|
|
# service
|
||
|
|
p = sub.add_parser('service', help='Run as long-lived service (dashboard + pipeline + scanner)')
|
||
|
|
p.set_defaults(func=cmd_service)
|
||
|
|
|
||
|
|
# organize
|
||
|
|
p = sub.add_parser('organize', help='Organize completed docs into Domain/Subdomain folders')
|
||
|
|
p.add_argument('--manifest', help='Bulk migration using pre-built manifest JSON')
|
||
|
|
p.add_argument('--hash', help='Organize a single document by hash')
|
||
|
|
p.add_argument('--dry-run', action='store_true', help='Show what would happen without moving')
|
||
|
|
p.add_argument('--limit', type=int, help='Limit number of docs to organize')
|
||
|
|
p.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
|
||
|
|
p.set_defaults(func=cmd_organize)
|
||
|
|
|
||
|
|
# ingest-peertube
|
||
|
|
p = sub.add_parser('ingest-peertube', help='Ingest PeerTube video transcripts')
|
||
|
|
p.add_argument('--channel', help='Ingest specific channel (actor_name)')
|
||
|
|
p.add_argument('--since', help='Only ingest videos published after this date (ISO format)')
|
||
|
|
p.add_argument('--stats', action='store_true', help='Show PeerTube instance stats')
|
||
|
|
p.add_argument('--enrich', action='store_true', help='Run enrichment after ingestion')
|
||
|
|
p.add_argument('--process', action='store_true', help='Full pipeline: ingest + enrich + embed')
|
||
|
|
p.set_defaults(func=cmd_ingest_peertube)
|
||
|
|
|
||
|
|
# ingest
|
||
|
|
p = sub.add_parser('ingest', help='Ingest intel data')
|
||
|
|
p.add_argument('--file', help='Ingest a specific JSON file')
|
||
|
|
p.add_argument('--directory', help='Ingest all JSON files from directory')
|
||
|
|
p.set_defaults(func=cmd_ingest)
|
||
|
|
|
||
|
|
|
||
|
|
# pipeline (Stream B)
|
||
|
|
p = sub.add_parser('pipeline', help='Stream B library pipeline (status, migrate, reverse, watch, sweep)')
|
||
|
|
p.add_argument('pipeline_action', nargs='?', default='status',
|
||
|
|
choices=['status', 'migrate', 'reverse', 'watch', 'sweep'],
|
||
|
|
help='Pipeline sub-action')
|
||
|
|
p.add_argument('--dry-run', action='store_true', help='Show what would happen without moving')
|
||
|
|
p.add_argument('--id', dest='operation_id', help='Operation ID for reverse')
|
||
|
|
p.add_argument('--domain', default=None, help='Domain name for migrate (default: Civil Organization)')
|
||
|
|
p.add_argument('--execute', action='store_true', help='Execute sweep plan')
|
||
|
|
p.add_argument('--resume', action='store_true', help='Resume sweep from checkpoint')
|
||
|
|
p.add_argument('--verify', action='store_true', help='Verify sweep results')
|
||
|
|
p.add_argument('--batch-size', type=int, default=500, help='Batch size for sweep execution')
|
||
|
|
p.add_argument('--max-entries', type=int, default=None, help='Max entries to process per invocation (for gated execution)')
|
||
|
|
p.add_argument('--plan-file', default=None, help='Path to sweep plan JSON (default: data/sweep/sweep_plan.json)')
|
||
|
|
p.set_defaults(func=cmd_pipeline)
|
||
|
|
|
||
|
|
args = parser.parse_args()
|
||
|
|
if not args.command:
|
||
|
|
parser.print_help()
|
||
|
|
return 1
|
||
|
|
|
||
|
|
return args.func(args)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == '__main__':
|
||
|
|
sys.exit(main() or 0)
|