Phase 4: Phase 3 cleanup fixes

Fix 1.1: filing preserves source file extension instead of defaulting to .pdf
Fix 1.2: back-fixed soldering transcript from .pdf to .txt (hash 380dbc78)
Fix 1.3: dispatcher logs missing processor modules at DEBUG, not ERROR
Fix 1.4: transcript processor cleans stale processing/concepts dirs on entry
Also: dispatcher now handles solo content files without .meta.json sidecar

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-14 16:39:57 +00:00
commit 9fe6a0a782
3 changed files with 56 additions and 17 deletions

View file

@ -6,6 +6,7 @@ that have been stable (mtime unchanged) for the configured threshold, then
hands them to the appropriate processor's pre_flight().
Phase 3: importable one-shot dispatcher. Service-loop integration in Phase 5.
Phase 4: sidecar is optional (PDFs may arrive without .meta.json).
"""
import importlib
import logging
@ -17,37 +18,44 @@ from .status import StatusDB
logger = logging.getLogger("recon.dispatcher")
# Content file extensions recognized by the dispatcher
CONTENT_EXTENSIONS = {'.txt', '.vtt', '.html', '.pdf'}
def _load_processor(processor_name):
"""Dynamically import a processor module from lib.processors."""
module_path = f"lib.processors.{processor_name}"
try:
return importlib.import_module(module_path)
except ModuleNotFoundError:
logger.debug("Processor module not found: %s (not yet implemented)", processor_name)
return None
except ImportError as e:
logger.error("Cannot load processor %s: %s", processor_name, e)
logger.error("Failed to import processor %s: %s", processor_name, e)
return None
def _find_pairs(subfolder_path):
"""Find content+sidecar pairs in a subfolder.
"""Find content files (with optional sidecar) in a subfolder.
A pair is two files sharing a basename:
<basename>.txt (or other content extension)
<basename>.meta.json (sidecar)
A pair is:
<basename>.<ext> content file
<basename>.meta.json optional sidecar
Returns list of (content_path, meta_path, basename) tuples.
Returns list of (content_path, meta_path_or_None, basename) tuples.
"""
if not os.path.isdir(subfolder_path):
return []
files = set(os.listdir(subfolder_path))
pairs = []
seen_basenames = set()
# First pass: find .meta.json files and their matching content
for fname in sorted(files):
if fname.endswith('.meta.json'):
basename = fname[:-len('.meta.json')]
# Look for matching content file (try common extensions)
for ext in ['.txt', '.vtt', '.html', '.pdf']:
for ext in sorted(CONTENT_EXTENSIONS):
content_name = basename + ext
if content_name in files:
pairs.append((
@ -55,8 +63,21 @@ def _find_pairs(subfolder_path):
os.path.join(subfolder_path, fname),
basename,
))
seen_basenames.add(content_name)
break
# Second pass: find solo content files (no sidecar)
for fname in sorted(files):
if fname in seen_basenames:
continue
_stem, ext = os.path.splitext(fname)
if ext.lower() in CONTENT_EXTENSIONS and not fname.endswith('.meta.json'):
pairs.append((
os.path.join(subfolder_path, fname),
None,
_stem,
))
return pairs
@ -99,10 +120,12 @@ def dispatch_once():
continue
for content_path, meta_path, basename in pairs:
# Both files must be stable
if not (_is_stable(content_path, stability_seconds) and
_is_stable(meta_path, stability_seconds)):
logger.debug("Pair %s not yet stable, skipping", basename)
# Content file must be stable; sidecar too if present
if not _is_stable(content_path, stability_seconds):
logger.debug("File %s not yet stable, skipping", basename)
continue
if meta_path and not _is_stable(meta_path, stability_seconds):
logger.debug("Sidecar for %s not yet stable, skipping", basename)
continue
logger.info("Dispatching %s/%s to %s", subfolder_name, basename, processor_name)