Phase 6f-2: format normalizer in dispatcher

Adds _normalize_formats() to the dispatcher that converts non-standard
document formats to PDF before dispatch. Supports:
- .epub, .mobi -> PDF via ebook-convert (Calibre)
- .doc, .docx -> PDF via LibreOffice headless

Called per-subfolder before _find_pairs() so _find_pairs() only ever
sees standard content files. Conversion failures are logged and
skipped -- the original file stays in acquired/ for manual review.

Also converts 3 staged epub files and cleans up _staging/.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-15 23:08:19 +00:00
commit f4659d155f

View file

@ -7,10 +7,12 @@ hands them to the appropriate processor's pre_flight().
Phase 3: importable one-shot dispatcher. Service-loop integration in Phase 5.
Phase 4: sidecar is optional (PDFs may arrive without .meta.json).
Phase 6f-2: format normalizer converts non-standard formats to PDF before dispatch.
"""
import importlib
import logging
import os
import subprocess
import time
from .utils import get_config
@ -21,6 +23,9 @@ logger = logging.getLogger("recon.dispatcher")
# Content file extensions recognized by the dispatcher.
# Entries are lowercase and include the leading dot.
CONTENT_EXTENSIONS = {'.txt', '.vtt', '.html', '.pdf'}
# Non-standard formats that can be converted to PDF before dispatch.
# _normalize_formats() lowercases each file's extension before testing
# membership here, then routes .epub/.mobi to ebook-convert and
# .doc/.docx to LibreOffice.
CONVERTIBLE_EXTENSIONS = {'.epub', '.mobi', '.doc', '.docx'}
def _load_processor(processor_name):
"""Dynamically import a processor module from lib.processors."""
@ -35,6 +40,63 @@ def _load_processor(processor_name):
return None
def _normalize_formats(subfolder_path):
    """Convert non-standard document formats to PDF before dispatch.

    Scans the top level of ``subfolder_path`` (no recursion) for regular
    files whose extension, compared case-insensitively, is listed in
    CONVERTIBLE_EXTENSIONS:

    - .epub / .mobi are converted with ``ebook-convert``
    - .doc / .docx are converted with ``libreoffice --headless``

    Each conversion is given 300 seconds. When a non-empty
    ``<stem>.pdf`` results, the original file is deleted. When the
    conversion fails, times out, or yields no usable output, the original
    is left in place and any partial ``<stem>.pdf`` written by the
    attempt is removed, so a later run retries instead of treating the
    broken file as an existing target.

    Files whose ``<stem>.pdf`` already exists before the attempt are
    skipped untouched.

    Args:
        subfolder_path: Directory to scan.

    Returns:
        Number of files successfully converted; 0 when ``subfolder_path``
        is not a directory.
    """
    if not os.path.isdir(subfolder_path):
        return 0

    converted = 0
    for fname in sorted(os.listdir(subfolder_path)):
        stem, ext = os.path.splitext(fname)
        ext = ext.lower()
        if ext not in CONVERTIBLE_EXTENSIONS:
            continue

        source = os.path.join(subfolder_path, fname)
        if not os.path.isfile(source):
            # A directory that merely ends in ".epub" etc. is not a document.
            continue

        target = os.path.join(subfolder_path, stem + '.pdf')
        if os.path.exists(target):
            logger.debug("Target PDF already exists, skipping: %s", fname)
            continue

        # The target did not exist above, so anything found at that path
        # after a failed attempt is partial output from this attempt.
        succeeded = False
        try:
            if ext in ('.epub', '.mobi'):
                subprocess.run(
                    ['ebook-convert', source, target],
                    capture_output=True, check=True, timeout=300,
                )
            elif ext in ('.doc', '.docx'):
                # LibreOffice names the output after the source stem, so
                # the result lands at ``target`` inside the same folder.
                subprocess.run(
                    ['libreoffice', '--headless', '--convert-to', 'pdf',
                     '--outdir', subfolder_path, source],
                    capture_output=True, check=True, timeout=300,
                )

            succeeded = os.path.isfile(target) and os.path.getsize(target) > 0
            if succeeded:
                os.remove(source)
                converted += 1
                logger.info("Converted %s -> %s.pdf", fname, stem)
            else:
                logger.warning("Conversion produced no output: %s", fname)
        except subprocess.TimeoutExpired:
            logger.error("Conversion timed out: %s", fname)
        except subprocess.CalledProcessError as e:
            logger.error("Conversion failed for %s: %s", fname,
                         e.stderr.decode(errors='replace')[:200] if e.stderr else str(e))
        except Exception as e:
            # Best-effort boundary: one bad file (or a missing converter
            # binary) must not abort the rest of the folder.
            logger.error("Unexpected error converting %s: %s", fname, e,
                         exc_info=True)
        finally:
            if not succeeded and os.path.isfile(target):
                # Drop truncated/empty output so it is neither dispatched
                # as a real PDF nor blocks a retry on the next run.
                try:
                    os.remove(target)
                except OSError as cleanup_err:
                    logger.warning("Could not remove partial output %s: %s",
                                   target, cleanup_err)

    return converted
def _find_pairs(subfolder_path):
"""Find content files (with optional sidecar) in a subfolder.
@ -115,6 +177,9 @@ def dispatch_once():
logger.error("Processor %s has no pre_flight function", processor_name)
continue
# Convert non-standard formats to PDF before scanning for pairs
_normalize_formats(subfolder_path)
pairs = _find_pairs(subfolder_path)
if not pairs:
continue