Initial commit: RECON codebase baseline

Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete).
Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-14 14:57:23 +00:00
commit 563c16bb71
59 changed files with 18327 additions and 0 deletions

67
run-pipeline-now.sh Executable file
View file

@ -0,0 +1,67 @@
#!/bin/bash
# RECON Pipeline — Skip scan, run extract + enrich in parallel, then embed
# Scan already completed (10,162 catalogued). 6,211 extracted, 3,603 queued.
set -euo pipefail
cd /opt/recon
source venv/bin/activate
LOGDIR="logs"
mkdir -p "$LOGDIR"
TS=$(date +%Y%m%d_%H%M%S)
MAIN_LOG="$LOGDIR/pipeline_${TS}.log"
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$MAIN_LOG"
}
log "=== RECON Pipeline (parallel extract+enrich) ==="
log "Skipping scan (already done). Starting extract + enrich concurrently."
# Reset any stuck docs from previous kill
sqlite3 data/recon.db "UPDATE documents SET status='queued' WHERE status='extracting';"
sqlite3 data/recon.db "UPDATE documents SET status='extracted' WHERE status='enriching';"
sqlite3 data/recon.db "UPDATE documents SET status='enriched' WHERE status='embedding';"
# Status before
log "Before:"
sqlite3 data/recon.db "SELECT status, COUNT(*) FROM documents GROUP BY status;" | while read line; do log " $line"; done
# Start extract and enrich in parallel
log "--- Starting Extract (4 workers) + Enrich (16 workers) ---"
python3 recon.py extract --workers 4 >> "$LOGDIR/extract_${TS}.log" 2>&1 &
EXTRACT_PID=$!
log " Extract PID: $EXTRACT_PID"
sleep 3
python3 recon.py enrich --workers 16 >> "$LOGDIR/enrich_${TS}.log" 2>&1 &
ENRICH_PID=$!
log " Enrich PID: $ENRICH_PID"
# Monitor loop — report progress every 5 minutes
while kill -0 $EXTRACT_PID 2>/dev/null || kill -0 $ENRICH_PID 2>/dev/null; do
sleep 300
STATS=$(sqlite3 data/recon.db "SELECT status, COUNT(*) FROM documents GROUP BY status;" | tr '\n' ' ')
log " Progress: $STATS"
done
log " Extract + Enrich finished"
# Second enrich pass (catch docs extracted during first enrich)
REMAINING=$(sqlite3 data/recon.db "SELECT COUNT(*) FROM documents WHERE status='extracted';")
if [ "$REMAINING" -gt 0 ]; then
log "--- Enrich pass 2: $REMAINING remaining ---"
python3 recon.py enrich --workers 16 >> "$LOGDIR/enrich_${TS}.log" 2>&1
log " Pass 2 complete"
fi
# Embed
log "--- Embed ---"
python3 recon.py embed --workers 4 >> "$LOGDIR/embed_${TS}.log" 2>&1
log " Embed complete"
log "=== Pipeline Complete ==="
python3 recon.py status 2>&1 | tee -a "$MAIN_LOG"
log "Finished: $(date)"