mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Migrate dashboard upload to pipeline with multi-format support
Upload handler now writes files to the appropriate hopper subfolder instead of copying directly to /mnt/library/:

- .pdf -> acquired/pdf/
- .txt -> acquired/text/
- .epub, .doc, .docx, .mobi -> acquired/pdf/ (the dispatcher's format normalizer converts these to PDF before processing)

The dispatcher picks up files and routes them through the appropriate processor (pdf_processor or text_processor) for full metadata voting, domain classification, and canonical filing.

Changes to api_upload() / _process_upload():

- Relaxed extension check: PDF, TXT, EPUB, DOC, DOCX, MOBI
- Routes to the correct hopper subfolder by extension
- Writes a meta.json sidecar with the original filename and category hint
- Removed: direct library copy, add_to_catalogue, queue_document
- Added: hopper-level dedup check (catches rapid re-uploads)
- Kept: catalogue dedup check for immediate user feedback

Changes to api_upload_status():

- Added fallback: checks the acquired/ and processing/ dirs if the hash is not yet in the documents table (covers the gap between upload and dispatcher pickup)

Template updated: the accept attribute and help text now reflect multi-format support.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
999cf37626
commit
e6224cb279
2 changed files with 69 additions and 44 deletions
106
lib/api.py
106
lib/api.py
|
|
@ -9,6 +9,7 @@ API endpoints for all pipeline operations including crawl, ingest, and search.
|
||||||
Dependencies: Flask, qdrant-client, requests
|
Dependencies: Flask, qdrant-client, requests
|
||||||
Config: web, vector_db, embedding sections of config.yaml
|
Config: web, vector_db, embedding sections of config.yaml
|
||||||
"""
|
"""
|
||||||
|
import glob
|
||||||
import json
|
import json
|
||||||
import threading
|
import threading
|
||||||
import os
|
import os
|
||||||
|
|
@ -77,25 +78,20 @@ def _format_source_citation(payload):
|
||||||
return book
|
return book
|
||||||
|
|
||||||
|
|
||||||
def _resolve_upload_path(category, config):
|
ALLOWED_EXTENSIONS = {'.pdf', '.txt', '.epub', '.doc', '.docx', '.mobi'}
|
||||||
"""Resolve the target directory for an upload given a category name."""
|
|
||||||
upload_paths = config.get('upload_paths', {})
|
|
||||||
library_root = config['library_root']
|
|
||||||
|
|
||||||
if category in upload_paths:
|
HOPPER_ROUTING = {
|
||||||
return upload_paths[category]
|
'.pdf': '/opt/recon/data/acquired/pdf/',
|
||||||
|
'.txt': '/opt/recon/data/acquired/text/',
|
||||||
default_path = upload_paths.get('default', library_root)
|
'.epub': '/opt/recon/data/acquired/pdf/',
|
||||||
safe_category = secure_filename(category) if category else ''
|
'.doc': '/opt/recon/data/acquired/pdf/',
|
||||||
if safe_category:
|
'.docx': '/opt/recon/data/acquired/pdf/',
|
||||||
return os.path.join(default_path, safe_category)
|
'.mobi': '/opt/recon/data/acquired/pdf/',
|
||||||
return default_path
|
}
|
||||||
|
|
||||||
|
|
||||||
def _process_upload(filepath, original_filename, category, config, db):
|
def _process_upload(filepath, original_filename, ext, category, config, db):
|
||||||
"""Process a single PDF upload: hash, dedup, copy to library, catalogue, queue."""
|
"""Process an upload: hash, dedup, drop into hopper for dispatcher pickup."""
|
||||||
library_root = config['library_root']
|
|
||||||
|
|
||||||
file_hash = content_hash(filepath)
|
file_hash = content_hash(filepath)
|
||||||
|
|
||||||
conn = db._get_conn()
|
conn = db._get_conn()
|
||||||
|
|
@ -103,34 +99,39 @@ def _process_upload(filepath, original_filename, category, config, db):
|
||||||
if existing:
|
if existing:
|
||||||
raise ValueError(f"Duplicate: file already catalogued as {existing['filename']}")
|
raise ValueError(f"Duplicate: file already catalogued as {existing['filename']}")
|
||||||
|
|
||||||
target_dir = _resolve_upload_path(category, config)
|
# Also check if already sitting in a hopper dir awaiting dispatch
|
||||||
os.makedirs(target_dir, exist_ok=True)
|
for hopper in HOPPER_ROUTING.values():
|
||||||
|
if any(os.path.exists(os.path.join(hopper, file_hash + e)) for e in ALLOWED_EXTENSIONS):
|
||||||
|
raise ValueError("Duplicate: file already queued for processing")
|
||||||
|
|
||||||
safe_name = secure_filename(original_filename)
|
hopper_dir = HOPPER_ROUTING.get(ext, '/opt/recon/data/acquired/pdf/')
|
||||||
if not safe_name:
|
os.makedirs(hopper_dir, exist_ok=True)
|
||||||
safe_name = f"{file_hash}.pdf"
|
|
||||||
target_path = os.path.join(target_dir, safe_name)
|
|
||||||
|
|
||||||
if os.path.exists(target_path):
|
target_path = os.path.join(hopper_dir, file_hash + ext)
|
||||||
base, ext = os.path.splitext(safe_name)
|
meta_path = os.path.join(hopper_dir, file_hash + '.meta.json')
|
||||||
target_path = os.path.join(target_dir, f"{base}_{file_hash[:8]}{ext}")
|
|
||||||
|
stem = os.path.splitext(original_filename)[0]
|
||||||
|
sidecar = {
|
||||||
|
'title': stem,
|
||||||
|
'source': 'dashboard_upload',
|
||||||
|
'source_type': ext.lstrip('.'),
|
||||||
|
'category': category,
|
||||||
|
'original_filename': original_filename,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Write sidecar first (with .tmp safety), then content
|
||||||
|
tmp_meta = meta_path + '.tmp'
|
||||||
|
with open(tmp_meta, 'w', encoding='utf-8') as f:
|
||||||
|
json.dump(sidecar, f, indent=2)
|
||||||
|
os.rename(tmp_meta, meta_path)
|
||||||
|
|
||||||
shutil.copy2(filepath, target_path)
|
shutil.copy2(filepath, target_path)
|
||||||
size = os.path.getsize(target_path)
|
|
||||||
|
|
||||||
source, derived_category = derive_source_and_category(target_path, library_root)
|
|
||||||
|
|
||||||
db.add_to_catalogue(file_hash, safe_name, target_path, size, source, derived_category)
|
|
||||||
db.queue_document(file_hash)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'hash': file_hash,
|
'hash': file_hash,
|
||||||
'filename': safe_name,
|
'filename': original_filename,
|
||||||
'category': derived_category,
|
'source_type': ext.lstrip('.'),
|
||||||
'source': source,
|
'status': 'queued',
|
||||||
'path': target_path,
|
|
||||||
'size_bytes': size,
|
|
||||||
'status': 'queued'
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -346,22 +347,23 @@ def api_upload():
|
||||||
if not file.filename:
|
if not file.filename:
|
||||||
return jsonify({'error': 'No file selected'}), 400
|
return jsonify({'error': 'No file selected'}), 400
|
||||||
|
|
||||||
if not file.filename.lower().endswith('.pdf'):
|
ext = os.path.splitext(file.filename)[1].lower()
|
||||||
return jsonify({'error': 'Only PDF files are accepted'}), 400
|
if ext not in ALLOWED_EXTENSIONS:
|
||||||
|
return jsonify({'error': f'Unsupported file type: {ext}'}), 400
|
||||||
|
|
||||||
category = request.form.get('category', '').strip()
|
category = request.form.get('category', '').strip()
|
||||||
|
|
||||||
config = get_config()
|
config = get_config()
|
||||||
db = StatusDB()
|
db = StatusDB()
|
||||||
|
|
||||||
tmp_fd, tmp_path = tempfile.mkstemp(suffix='.pdf')
|
tmp_fd, tmp_path = tempfile.mkstemp(suffix=ext)
|
||||||
try:
|
try:
|
||||||
file.save(tmp_path)
|
file.save(tmp_path)
|
||||||
|
|
||||||
if os.path.getsize(tmp_path) == 0:
|
if os.path.getsize(tmp_path) == 0:
|
||||||
return jsonify({'error': 'Uploaded file is empty'}), 400
|
return jsonify({'error': 'Uploaded file is empty'}), 400
|
||||||
|
|
||||||
result = _process_upload(tmp_path, file.filename, category, config, db)
|
result = _process_upload(tmp_path, file.filename, ext, category, config, db)
|
||||||
return jsonify(result), 201
|
return jsonify(result), 201
|
||||||
|
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
|
|
@ -390,6 +392,28 @@ def api_upload_status(doc_hash):
|
||||||
'filename': cat['filename'],
|
'filename': cat['filename'],
|
||||||
'status': cat['status'],
|
'status': cat['status'],
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Check hopper dirs for files awaiting dispatcher pickup
|
||||||
|
for hopper in ('/opt/recon/data/acquired/pdf/', '/opt/recon/data/acquired/text/'):
|
||||||
|
if glob.glob(os.path.join(hopper, doc_hash + '.*')):
|
||||||
|
return jsonify({
|
||||||
|
'hash': doc_hash,
|
||||||
|
'status': 'pending',
|
||||||
|
'message': 'Waiting for dispatcher',
|
||||||
|
})
|
||||||
|
|
||||||
|
# Check processing dir
|
||||||
|
proc_dir = os.path.join(
|
||||||
|
config.get('pipeline', {}).get('processing_root', '/opt/recon/data/processing'),
|
||||||
|
doc_hash,
|
||||||
|
)
|
||||||
|
if os.path.isdir(proc_dir):
|
||||||
|
return jsonify({
|
||||||
|
'hash': doc_hash,
|
||||||
|
'status': 'processing',
|
||||||
|
'message': 'Being processed',
|
||||||
|
})
|
||||||
|
|
||||||
return jsonify({'error': 'Document not found'}), 404
|
return jsonify({'error': 'Document not found'}), 404
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,13 @@
|
||||||
{% extends "base.html" %}
|
{% extends "base.html" %}
|
||||||
{% block content %}
|
{% block content %}
|
||||||
<h3 class="section-title mb-16">Upload PDF</h3>
|
<h3 class="section-title mb-16">Upload Document</h3>
|
||||||
<div class="panel">
|
<div class="panel">
|
||||||
<form id="upload-form" enctype="multipart/form-data">
|
<form id="upload-form" enctype="multipart/form-data">
|
||||||
<div class="mb-16">
|
<div class="mb-16">
|
||||||
<label class="text-dim text-xs" style="text-transform:uppercase;display:block;margin-bottom:4px;">PDF File</label>
|
<label class="text-dim text-xs" style="text-transform:uppercase;display:block;margin-bottom:4px;">Document File</label>
|
||||||
<input type="file" name="file" accept=".pdf" id="upload-file"
|
<input type="file" name="file" accept=".pdf,.txt,.epub,.doc,.docx,.mobi" id="upload-file"
|
||||||
style="background:#0a0a0a;border:1px solid #333;color:#c0c0c0;padding:8px;width:100%;font-family:inherit;">
|
style="background:#0a0a0a;border:1px solid #333;color:#c0c0c0;padding:8px;width:100%;font-family:inherit;">
|
||||||
|
<span class="text-dim" style="font-size:11px;display:block;margin-top:4px;">Supported: PDF, TXT, EPUB, DOC, DOCX, MOBI</span>
|
||||||
</div>
|
</div>
|
||||||
<div class="mb-16">
|
<div class="mb-16">
|
||||||
<label class="text-dim text-xs" style="text-transform:uppercase;display:block;margin-bottom:4px;">Category</label>
|
<label class="text-dim text-xs" style="text-transform:uppercase;display:block;margin-bottom:4px;">Category</label>
|
||||||
|
|
@ -67,7 +68,7 @@ document.getElementById('upload-form').addEventListener('submit', async function
|
||||||
result.innerHTML = '<span style="color:#00ff41;">Queued for processing</span><br>' +
|
result.innerHTML = '<span style="color:#00ff41;">Queued for processing</span><br>' +
|
||||||
'<span class="text-dim">Hash: ' + data.hash + '</span><br>' +
|
'<span class="text-dim">Hash: ' + data.hash + '</span><br>' +
|
||||||
'<span class="text-dim">File: ' + data.filename + '</span><br>' +
|
'<span class="text-dim">File: ' + data.filename + '</span><br>' +
|
||||||
'<span class="text-dim">Category: ' + data.source + '/' + data.category + '</span>';
|
'<span class="text-dim">Type: ' + data.source_type + '</span>';
|
||||||
fileInput.value = '';
|
fileInput.value = '';
|
||||||
} else {
|
} else {
|
||||||
status.style.color = '#ff4444';
|
status.style.color = '#ff4444';
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue