Phase 6f-2: format normalizer in dispatcher

Adds _normalize_formats() to the dispatcher, which converts non-standard document formats to PDF before dispatch.

Supports:
- .epub, .mobi -> PDF via ebook-convert (Calibre)
- .doc, .docx -> PDF via LibreOffice headless

Called per-subfolder before _find_pairs(), so _find_pairs() only ever sees standard content files. Conversion failures are logged and skipped -- the original file stays in acquired/ for manual review.

Also converts 3 staged epub files and cleans up _staging/.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-20 06:34:40 +02:00 · 2026-04-15 23:08:19 +00:00 · 2026-04-15 23:08:19 +00:00 · f4659d155f
commit f4659d155f
parent 62539861f2
1 changed files with 65 additions and 0 deletions
--- a/lib/dispatcher.py
+++ b/lib/dispatcher.py
@ -7,10 +7,12 @@ hands them to the appropriate processor's pre_flight().

 Phase 3: importable one-shot dispatcher. Service-loop integration in Phase 5.
 Phase 4: sidecar is optional (PDFs may arrive without .meta.json).
+Phase 6f-2: format normalizer converts non-standard formats to PDF before dispatch.
 """
 import importlib
 import logging
 import os
+import subprocess
 import time

 from .utils import get_config
@ -21,6 +23,9 @@ logger = logging.getLogger("recon.dispatcher")
 # Content file extensions recognized by the dispatcher
 CONTENT_EXTENSIONS = {'.txt', '.vtt', '.html', '.pdf'}

+# Non-standard formats that _normalize_formats() converts to PDF before
+# dispatch: .epub/.mobi via ebook-convert, .doc/.docx via libreoffice.
+CONVERTIBLE_EXTENSIONS = {'.epub', '.mobi', '.doc', '.docx'}
+

 def _load_processor(processor_name):
    """Dynamically import a processor module from lib.processors."""
@ -35,6 +40,63 @@ def _load_processor(processor_name):
        return None


+def _normalize_formats(subfolder_path):
+    """Convert non-standard document formats to PDF before dispatch.
+
+    Scans the top level of ``subfolder_path`` (no recursion) for regular
+    files whose extension is in CONVERTIBLE_EXTENSIONS. Each one is
+    converted to ``<stem>.pdf`` in the same directory: .epub/.mobi via
+    ``ebook-convert``, .doc/.docx via ``libreoffice --headless``. The
+    original is deleted only after a non-empty PDF has been produced.
+
+    Failures (timeout, non-zero exit, missing tool, empty output) are logged
+    and the source file is left in place. Any partial PDF written by a
+    failed attempt is removed so that it is neither picked up as content by
+    a later scan nor mistaken for a finished conversion on the next run.
+
+    Args:
+        subfolder_path: Directory to scan. A path that is not a directory
+            yields 0 without error.
+
+    Returns:
+        Number of files for which a non-empty PDF was produced.
+    """
+    if not os.path.isdir(subfolder_path):
+        return 0
+
+    converted = 0
+
+    for fname in sorted(os.listdir(subfolder_path)):
+        stem, ext = os.path.splitext(fname)
+        ext = ext.lower()
+        if ext not in CONVERTIBLE_EXTENSIONS:
+            continue
+
+        source = os.path.join(subfolder_path, fname)
+        target = os.path.join(subfolder_path, stem + '.pdf')
+
+        # A directory that merely looks like "foo.epub" is not convertible.
+        if not os.path.isfile(source):
+            continue
+
+        if os.path.exists(target):
+            logger.debug("Target PDF already exists, skipping: %s", fname)
+            continue
+
+        if ext in ('.epub', '.mobi'):
+            command = ['ebook-convert', source, target]
+        elif ext in ('.doc', '.docx'):
+            # LibreOffice derives the output name from the source stem, so
+            # the result lands at `target` inside --outdir.
+            command = ['libreoffice', '--headless', '--convert-to', 'pdf',
+                       '--outdir', subfolder_path, source]
+        else:
+            logger.warning("No converter configured for %s", fname)
+            continue
+
+        succeeded = False
+        try:
+            subprocess.run(
+                command, capture_output=True, check=True, timeout=300,
+            )
+            succeeded = os.path.isfile(target) and os.path.getsize(target) > 0
+            if not succeeded:
+                logger.warning("Conversion produced no output: %s", fname)
+        except subprocess.TimeoutExpired:
+            logger.error("Conversion timed out: %s", fname)
+        except subprocess.CalledProcessError as e:
+            logger.error("Conversion failed for %s: %s", fname,
+                         e.stderr.decode(errors='replace')[:200] if e.stderr else str(e))
+        except Exception as e:
+            # Best-effort boundary: e.g. the converter binary is not installed.
+            logger.error("Unexpected error converting %s: %s", fname, e)
+
+        if not succeeded:
+            # `target` did not exist before this attempt, so anything there
+            # now is a partial/empty artifact of the failed conversion.
+            if os.path.lexists(target):
+                try:
+                    os.remove(target)
+                except OSError as e:
+                    logger.warning("Could not remove partial output %s: %s",
+                                   target, e)
+            continue
+
+        converted += 1
+        logger.info("Converted %s -> %s.pdf", fname, stem)
+        try:
+            os.remove(source)
+        except OSError as e:
+            # The PDF is valid; only the cleanup of the original failed.
+            logger.error("Converted %s but could not remove original: %s",
+                         fname, e)
+
+    return converted
+
+
 def _find_pairs(subfolder_path):
    """Find content files (with optional sidecar) in a subfolder.

@ -115,6 +177,9 @@ def dispatch_once():
            logger.error("Processor %s has no pre_flight function", processor_name)
            continue

+        # Convert non-standard formats to PDF before scanning for pairs
+        _normalize_formats(subfolder_path)
+
        pairs = _find_pairs(subfolder_path)
        if not pairs:
            continue