Extract _full_zim_cleanup helper, add SIGHUP + scrape_jobs cleanup

- Extract shared _full_zim_cleanup(source_id) from api_kiwix_remove
- Send SIGHUP to kiwix-serve after kiwix-manage remove so it reloads the library
- Delete linked scrape_jobs rows during ZIM removal
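- Update api_scraper_delete to run the full ZIM cleanup when the job has a linked zim_source_id, and to remove the orphan ZIM file and library entry when it only has a zim_filename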
- Set chromium_path to /usr/bin/chromium-browser (previously empty, i.e. auto-detected from Playwright) for SingleFile browser crawl support
- Add status.db to .gitignore

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-19 02:28:49 +00:00
commit f0b160ef7c
3 changed files with 87 additions and 12 deletions

1
.gitignore vendored
View file

@ -27,3 +27,4 @@ recon.db
# Kiwix binary tools (installed from tarball) # Kiwix binary tools (installed from tarball)
bin/ bin/
status.db

View file

@ -487,7 +487,7 @@ scraper:
# SingleFile CLI settings (browser crawl mode) # SingleFile CLI settings (browser crawl mode)
singlefile: singlefile:
executable: single-file executable: single-file
chromium_path: "" # Auto-detected from Playwright if empty chromium_path: "/usr/bin/chromium-browser"
crawl_max_depth: 10 crawl_max_depth: 10
# Stream B: New Library Pipeline # Stream B: New Library Pipeline

View file

@ -2060,23 +2060,24 @@ def api_kiwix_upload():
@app.route('/api/kiwix/remove/<int:source_id>', methods=['POST']) def _full_zim_cleanup(source_id):
def api_kiwix_remove(source_id): """Full ZIM cleanup: Qdrant vectors, DB records, kiwix-manage, SIGHUP, file delete.
"""Remove a ZIM source: delete vectors, DB records, library entry, and file.""" Returns dict with results. Caller handles cache refresh."""
import subprocess import subprocess
import signal
import requests as req import requests as req
db = StatusDB() db = StatusDB()
conn = db._get_conn() conn = db._get_conn()
row = conn.execute("SELECT * FROM zim_sources WHERE id = ?", (source_id,)).fetchone() row = conn.execute("SELECT * FROM zim_sources WHERE id = ?", (source_id,)).fetchone()
if not row: if not row:
return jsonify({'error': 'Source not found'}), 404 return None
zim_source = dict(row) zim_source = dict(row)
zim_filename = zim_source['zim_filename'] zim_filename = zim_source['zim_filename']
zim_path = zim_source['zim_path'] zim_path = zim_source['zim_path']
zim_title = zim_source.get('title', zim_filename) zim_title = zim_source.get('title', zim_filename)
results = {'vectors_deleted': 0, 'docs_deleted': 0, 'file_deleted': False} results = {'vectors_deleted': 0, 'docs_deleted': 0, 'file_deleted': False, 'scrape_jobs_deleted': 0}
# Step 1: Find all document hashes for this ZIM source # Step 1: Find all document hashes for this ZIM source
doc_hashes = [r['hash'] for r in conn.execute( doc_hashes = [r['hash'] for r in conn.execute(
@ -2135,7 +2136,6 @@ def api_kiwix_remove(source_id):
# Step 4: Remove from kiwix-serve library # Step 4: Remove from kiwix-serve library
try: try:
# Get the book ID from library.xml
subprocess.run( subprocess.run(
['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'remove', zim_filename.replace('.zim', '')], ['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'remove', zim_filename.replace('.zim', '')],
capture_output=True, text=True, timeout=10 capture_output=True, text=True, timeout=10
@ -2143,6 +2143,16 @@ def api_kiwix_remove(source_id):
except Exception as e: except Exception as e:
logger.warning(f"kiwix-manage remove failed: {e}") logger.warning(f"kiwix-manage remove failed: {e}")
# Step 4b: SIGHUP kiwix-serve to reload library
try:
result = subprocess.run(['pidof', 'kiwix-serve'], capture_output=True, text=True, timeout=5)
if result.returncode == 0 and result.stdout.strip():
pid = int(result.stdout.strip().split()[0])
os.kill(pid, signal.SIGHUP)
logger.info(f"Sent SIGHUP to kiwix-serve (pid {pid})")
except Exception as e:
logger.warning(f"Failed to signal kiwix-serve: {e}")
# Step 5: Delete the ZIM file # Step 5: Delete the ZIM file
if os.path.isfile(zim_path): if os.path.isfile(zim_path):
try: try:
@ -2152,13 +2162,37 @@ def api_kiwix_remove(source_id):
logger.warning(f"ZIM file delete failed: {e}") logger.warning(f"ZIM file delete failed: {e}")
results['file_deleted'] = False results['file_deleted'] = False
# Step 6: Delete any linked scrape_jobs rows
try:
res = conn.execute("DELETE FROM scrape_jobs WHERE zim_source_id = ?", (source_id,))
conn.commit()
results['scrape_jobs_deleted'] = res.rowcount
except Exception as e:
logger.warning(f"scrape_jobs cleanup failed: {e}")
logger.info(f"Full ZIM cleanup for source {source_id} ('{zim_title}'): {results}")
return results
@app.route('/api/kiwix/remove/<int:source_id>', methods=['POST'])
def api_kiwix_remove(source_id):
"""Remove a ZIM source: delete vectors, DB records, library entry, and file."""
db = StatusDB()
conn = db._get_conn()
row = conn.execute("SELECT * FROM zim_sources WHERE id = ?", (source_id,)).fetchone()
if not row:
return jsonify({'error': 'Source not found'}), 404
results = _full_zim_cleanup(source_id)
if results is None:
return jsonify({'error': 'Source not found during cleanup'}), 404
# Refresh cache # Refresh cache
try: try:
_cache['kiwix_sources'] = _build_kiwix_sources() _cache['kiwix_sources'] = _build_kiwix_sources()
except Exception: except Exception:
pass pass
logger.info(f"Removed ZIM source '{zim_title}': {results}")
return jsonify({'ok': True, 'results': results}) return jsonify({'ok': True, 'results': results})
@ -2375,20 +2409,60 @@ def api_scraper_retry(job_id):
@app.route('/api/scraper/delete/<int:job_id>', methods=['POST']) @app.route('/api/scraper/delete/<int:job_id>', methods=['POST'])
def api_scraper_delete(job_id): def api_scraper_delete(job_id):
"""Delete a scrape job (only if not currently running).""" """Delete a scrape job and clean up any associated ZIM artifacts."""
import subprocess
import signal
db = StatusDB() db = StatusDB()
job = db.get_scrape_job(job_id) job = db.get_scrape_job(job_id)
if not job: if not job:
return jsonify({'error': 'Job not found'}), 404 return jsonify({'error': 'Job not found'}), 404
if job['status'] == 'running': if job['status'] == 'running':
return jsonify({'error': 'Cannot delete a running job cancel it first'}), 400 return jsonify({'error': 'Cannot delete a running job \u2014 cancel it first'}), 400
zim_cleanup_results = None
# If the job has a linked zim_source, do full cleanup
if job.get('zim_source_id'):
zim_cleanup_results = _full_zim_cleanup(job['zim_source_id'])
try:
_cache['kiwix_sources'] = _build_kiwix_sources()
except Exception:
pass
elif job.get('zim_filename'):
# No zim_source row, but there may be an orphan file + library entry
zim_path = os.path.join('/mnt/kiwix', job['zim_filename'])
if os.path.isfile(zim_path):
try:
os.remove(zim_path)
logger.info(f"Deleted orphan ZIM file: {zim_path}")
except Exception as e:
logger.warning(f"Failed to delete orphan ZIM file {zim_path}: {e}")
try:
subprocess.run(
['/opt/recon/bin/kiwix-manage', '/mnt/kiwix/library.xml', 'remove',
job['zim_filename'].replace('.zim', '')],
capture_output=True, text=True, timeout=10
)
except Exception as e:
logger.warning(f"kiwix-manage remove failed for orphan: {e}")
try:
result = subprocess.run(['pidof', 'kiwix-serve'], capture_output=True, text=True, timeout=5)
if result.returncode == 0 and result.stdout.strip():
pid = int(result.stdout.strip().split()[0])
os.kill(pid, signal.SIGHUP)
logger.info(f"Sent SIGHUP to kiwix-serve (pid {pid})")
except Exception as e:
logger.warning(f"Failed to signal kiwix-serve: {e}")
# Delete the scrape_jobs row (may already be gone if _full_zim_cleanup deleted it)
conn = db._get_conn() conn = db._get_conn()
conn.execute("DELETE FROM scrape_jobs WHERE id = ?", (job_id,)) conn.execute("DELETE FROM scrape_jobs WHERE id = ?", (job_id,))
conn.commit() conn.commit()
logger.info(f"Scraper job {job_id} deleted")
return jsonify({'ok': True}) logger.info(f"Scraper job {job_id} deleted (zim_cleanup={zim_cleanup_results})")
return jsonify({'ok': True, 'zim_cleanup': zim_cleanup_results})
@app.route('/api/scraper/clear-failed', methods=['POST']) @app.route('/api/scraper/clear-failed', methods=['POST'])