mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 14:44:54 +02:00
Switch domain assignment to Qdrant as source of truth
Replace on-disk concept file reads with Qdrant payload queries for domain assignment. This unlocks assignment for ~10,120 items that had missing or legacy-only concept files on disk while Qdrant held the correct 18-domain taxonomy data. Changes: - domain_assigner.py: Replace _count_concept_domains (disk) with _count_domains_from_qdrant and _count_domains_from_qdrant_batch (Qdrant scroll queries). Add _get_qdrant_client helper. Remove pass 3 defensive re-run (Qdrant reads are consistent). Add no_concepts terminal status for zero-vector documents. - embedder.py: Post-embed hook passes existing qdrant client to compute_assignment, avoiding a second connection. - recon.py: Backfill creates one QdrantClient for the batch. SQL filter includes existing needs_reprocess items. Dry-run reports no_concepts as separate bucket. --reprocess-missing removes concept-dir deletion step (no longer reads from disk). - docs/domain-assignment.md: Algorithm references Qdrant, documents no_concepts status, removes pass 3 description. Dry-run results: 20,453 assigned, 1,392 tied, 298 no_concepts, 0 needs_reprocess, 0 errors (previously 10,416 needs_reprocess). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
c04ccc5011
commit
3b37d96c4d
4 changed files with 186 additions and 135 deletions
30
recon.py
30
recon.py
|
|
@ -865,6 +865,7 @@ def cmd_ingest(args):
|
|||
|
||||
def cmd_assign_categories(args):
|
||||
"""Assign RECON domains to PeerTube videos and push categories."""
|
||||
from qdrant_client import QdrantClient
|
||||
from lib.domain_assigner import compute_assignment, run_tiebreaker_pass
|
||||
from lib.peertube_writer import push_pending, extract_uuid
|
||||
from lib.recon_domains import DOMAIN_CATEGORY_MAP
|
||||
|
|
@ -876,11 +877,13 @@ def cmd_assign_categories(args):
|
|||
|
||||
if args.backfill:
|
||||
# Pass 1: assign domains to all complete stream docs with no assignment
|
||||
# or that previously got needs_reprocess
|
||||
conn = db._get_conn()
|
||||
q = """SELECT d.hash FROM documents d
|
||||
LEFT JOIN catalogue c ON d.hash = c.hash
|
||||
WHERE d.status = 'complete'
|
||||
AND d.recon_domain IS NULL
|
||||
AND (d.recon_domain IS NULL
|
||||
OR d.recon_domain_status = 'needs_reprocess')
|
||||
AND c.source = 'stream.echo6.co'
|
||||
ORDER BY d.discovered_at"""
|
||||
if limit:
|
||||
|
|
@ -895,10 +898,17 @@ def cmd_assign_categories(args):
|
|||
print(f"Backfill: processing {len(hashes)} documents" +
|
||||
(" [DRY RUN]" if dry_run else ""))
|
||||
|
||||
stats = {'assigned': 0, 'tied_pass_1': 0, 'needs_reprocess': 0, 'errors': 0}
|
||||
# Create one Qdrant client for the entire backfill
|
||||
qdrant = QdrantClient(
|
||||
host=config['vector_db']['host'],
|
||||
port=config['vector_db']['port'],
|
||||
timeout=60
|
||||
)
|
||||
|
||||
stats = {'assigned': 0, 'tied_pass_1': 0, 'no_concepts': 0, 'needs_reprocess': 0, 'errors': 0}
|
||||
for i, file_hash in enumerate(hashes):
|
||||
try:
|
||||
domain, status = compute_assignment(file_hash, db, config)
|
||||
domain, status = compute_assignment(file_hash, db, config, qdrant=qdrant)
|
||||
stats[status] = stats.get(status, 0) + 1
|
||||
if not dry_run:
|
||||
db.set_domain_assignment(file_hash, domain, status)
|
||||
|
|
@ -946,22 +956,10 @@ def cmd_assign_categories(args):
|
|||
for item in items:
|
||||
file_hash = item['hash']
|
||||
if dry_run:
|
||||
concepts_dir = os.path.join(config['paths']['concepts'], file_hash)
|
||||
has_concepts = os.path.isdir(concepts_dir)
|
||||
concept_count = len(os.listdir(concepts_dir)) if has_concepts else 0
|
||||
detail = f"DELETE {concept_count} concept files" if has_concepts else "no concept dir"
|
||||
print(f" Would reprocess: {file_hash[:12]} — {item.get('filename', '?')} ({detail})")
|
||||
print(f" Would reprocess: {file_hash[:12]} — {item.get('filename', '?')}")
|
||||
requeued += 1
|
||||
continue
|
||||
|
||||
# Remove stale concept files
|
||||
import shutil
|
||||
concepts_dir = os.path.join(config['paths']['concepts'], file_hash)
|
||||
if os.path.isdir(concepts_dir):
|
||||
logger.info(f" Deleting concept dir: {concepts_dir} "
|
||||
f"({len(os.listdir(concepts_dir))} files, hash={file_hash})")
|
||||
shutil.rmtree(concepts_dir)
|
||||
|
||||
# Reset document status to allow re-processing
|
||||
conn = db._get_conn()
|
||||
conn.execute(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue