mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Phase 6f-2: format normalizer in dispatcher
Adds _normalize_formats() to the dispatcher that converts non-standard document formats to PDF before dispatch. Supports: - .epub, .mobi -> PDF via ebook-convert (Calibre) - .doc, .docx -> PDF via LibreOffice headless Called per-subfolder before _find_pairs() so _find_pairs() only ever sees standard content files. Conversion failures are logged and skipped -- the original file stays in acquired/ for manual review. Also converts 3 staged epub files and cleans up _staging/. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
62539861f2
commit
f4659d155f
1 changed files with 65 additions and 0 deletions
|
|
@ -7,10 +7,12 @@ hands them to the appropriate processor's pre_flight().
|
|||
|
||||
Phase 3: importable one-shot dispatcher. Service-loop integration in Phase 5.
|
||||
Phase 4: sidecar is optional (PDFs may arrive without .meta.json).
|
||||
Phase 6f-2: format normalizer converts non-standard formats to PDF before dispatch.
|
||||
"""
|
||||
import importlib
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
from .utils import get_config
|
||||
|
|
@ -21,6 +23,9 @@ logger = logging.getLogger("recon.dispatcher")
|
|||
# Content file extensions recognized by the dispatcher
|
||||
CONTENT_EXTENSIONS = {'.txt', '.vtt', '.html', '.pdf'}
|
||||
|
||||
# Non-standard formats that can be converted to PDF before dispatch
|
||||
CONVERTIBLE_EXTENSIONS = {'.epub', '.mobi', '.doc', '.docx'}
|
||||
|
||||
|
||||
def _load_processor(processor_name):
|
||||
"""Dynamically import a processor module from lib.processors."""
|
||||
|
|
@ -35,6 +40,63 @@ def _load_processor(processor_name):
|
|||
return None
|
||||
|
||||
|
||||
def _normalize_formats(subfolder_path):
|
||||
"""Convert non-standard document formats to PDF before dispatch.
|
||||
|
||||
Walks the subfolder for files with convertible extensions (.epub, .mobi,
|
||||
.doc, .docx). Converts each to PDF using the appropriate tool, then
|
||||
deletes the original.
|
||||
|
||||
Returns count of files converted.
|
||||
"""
|
||||
if not os.path.isdir(subfolder_path):
|
||||
return 0
|
||||
|
||||
converted = 0
|
||||
|
||||
for fname in sorted(os.listdir(subfolder_path)):
|
||||
stem, ext = os.path.splitext(fname)
|
||||
if ext.lower() not in CONVERTIBLE_EXTENSIONS:
|
||||
continue
|
||||
|
||||
source = os.path.join(subfolder_path, fname)
|
||||
target = os.path.join(subfolder_path, stem + '.pdf')
|
||||
|
||||
if os.path.exists(target):
|
||||
logger.debug("Target PDF already exists, skipping: %s", fname)
|
||||
continue
|
||||
|
||||
try:
|
||||
if ext.lower() in ('.epub', '.mobi'):
|
||||
subprocess.run(
|
||||
['ebook-convert', source, target],
|
||||
capture_output=True, check=True, timeout=300,
|
||||
)
|
||||
elif ext.lower() in ('.doc', '.docx'):
|
||||
subprocess.run(
|
||||
['libreoffice', '--headless', '--convert-to', 'pdf',
|
||||
'--outdir', subfolder_path, source],
|
||||
capture_output=True, check=True, timeout=300,
|
||||
)
|
||||
|
||||
if os.path.isfile(target) and os.path.getsize(target) > 0:
|
||||
os.remove(source)
|
||||
converted += 1
|
||||
logger.info("Converted %s -> %s.pdf", fname, stem)
|
||||
else:
|
||||
logger.warning("Conversion produced no output: %s", fname)
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.error("Conversion timed out: %s", fname)
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error("Conversion failed for %s: %s", fname,
|
||||
e.stderr.decode(errors='replace')[:200] if e.stderr else str(e))
|
||||
except Exception as e:
|
||||
logger.error("Unexpected error converting %s: %s", fname, e)
|
||||
|
||||
return converted
|
||||
|
||||
|
||||
def _find_pairs(subfolder_path):
|
||||
"""Find content files (with optional sidecar) in a subfolder.
|
||||
|
||||
|
|
@ -115,6 +177,9 @@ def dispatch_once():
|
|||
logger.error("Processor %s has no pre_flight function", processor_name)
|
||||
continue
|
||||
|
||||
# Convert non-standard formats to PDF before scanning for pairs
|
||||
_normalize_formats(subfolder_path)
|
||||
|
||||
pairs = _find_pairs(subfolder_path)
|
||||
if not pairs:
|
||||
continue
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue