mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 14:44:54 +02:00
Phase 3: dispatcher, transcript processor, text_dir resolution
- lib/dispatcher.py: one-shot dispatcher that scans acquired/<type>/ for content+sidecar pairs and routes to registered processors
- lib/processors/transcript_processor.py: pre_flight() for transcripts (hash, dedupe, split into pages, register in DB, set text_dir)
- lib/utils.py: resolve_text_dir() helper for text_dir column fallback
- lib/enricher.py: use resolve_text_dir() instead of hardcoded path
- lib/embedder.py: use resolve_text_dir() instead of hardcoded path
- lib/processors/__init__.py, lib/acquisition/__init__.py: package inits

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
de2c59a501
commit
66fadb7487
7 changed files with 293 additions and 2 deletions
16
lib/utils.py
16
lib/utils.py
|
|
@ -388,3 +388,19 @@ def generate_download_url(filepath, library_root='/mnt/library', base_url='https
|
|||
parts = rel.split(os.sep)
|
||||
encoded = '/'.join(quote(p) for p in parts)
|
||||
return f"{base_url}/{encoded}"
|
||||
|
||||
|
||||
def resolve_text_dir(file_hash, config, db=None):
    """Return the text directory to use for the document ``file_hash``.

    Lookup order:

    1. When ``db`` is supplied, the ``text_dir`` column of the matching
       ``documents`` row is returned if it holds a non-empty value.
    2. Otherwise the legacy layout is used:
       ``config['paths']['text']/<file_hash>``.

    ``db`` must expose ``_get_conn()`` returning a connection whose rows can
    be indexed by column name.
    """
    stored_dir = None

    if db is not None:
        record = db._get_conn().execute(
            "SELECT text_dir FROM documents WHERE hash = ?", (file_hash,)
        ).fetchone()
        if record:
            stored_dir = record['text_dir']

    if stored_dir:
        return stored_dir

    # Built lazily: the config is only consulted when no usable override
    # was found in the database.
    return os.path.join(config['paths']['text'], file_hash)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue