Phase 3: dispatcher, transcript processor, text_dir resolution

- lib/dispatcher.py: one-shot dispatcher that scans acquired/<type>/ for content+sidecar pairs and routes to registered processors
- lib/processors/transcript_processor.py: pre_flight() for transcripts (hash, dedupe, split into pages, register in DB, set text_dir)
- lib/utils.py: resolve_text_dir() helper for text_dir column fallback
- lib/enricher.py: use resolve_text_dir() instead of hardcoded path
- lib/embedder.py: use resolve_text_dir() instead of hardcoded path
- lib/processors/__init__.py, lib/acquisition/__init__.py: package inits

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-20 06:34:40 +02:00 · 2026-04-14 15:39:42 +00:00 · 2026-04-14 15:39:42 +00:00 · 66fadb7487
commit 66fadb7487
parent de2c59a501
7 changed files with 293 additions and 2 deletions
--- a/lib/embedder.py
+++ b/lib/embedder.py
@@ -21,6 +21,7 @@ from qdrant_client.models import PointStruct, SparseVector

 from .utils import get_config, concept_id, generate_download_url, setup_logging
 from .status import StatusDB
+from .utils import resolve_text_dir

 logger = setup_logging('recon.embedder')

@@ -274,7 +275,7 @@ def embed_single(file_hash, db, config):
        source_type = 'web' if is_web else 'document'

        # Check meta.json for explicit source_type (e.g. 'transcript')
-        text_dir = os.path.join(config['paths']['text'], file_hash)
+        text_dir = resolve_text_dir(file_hash, config, db)
        meta_path = os.path.join(text_dir, 'meta.json')
        page_timestamps = {}
        if os.path.exists(meta_path):