commit 563c16bb717d99ae3b7906b730897460c88dec29 Author: Matt Date: Tue Apr 14 14:57:23 2026 +0000 Initial commit: RECON codebase baseline Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..238cabb --- /dev/null +++ b/.gitignore @@ -0,0 +1,26 @@ +# Python +venv/ +__pycache__/ +*.pyc +*.pyo + +# Secrets +.env + +# Runtime data +data/ +logs/ +pipeline.log +recon.db + +# Backups +*.bak +*.bak-* +*.bak.* +*.bak2.* + +# Junk +-.png + +# OS +.DS_Store diff --git a/PROJECT-BIBLE.md b/PROJECT-BIBLE.md new file mode 100644 index 0000000..1348cf0 --- /dev/null +++ b/PROJECT-BIBLE.md @@ -0,0 +1,785 @@ +# RECON Project Bible v2.0 + +*Last updated: 2026-02-16* + +--- + +## 1. Mission Statement + +RECON (Reconnaissance, Extraction, Conceptualization, and Operationalization of kNowledge) is a knowledge extraction pipeline that processes PDFs and web content into structured concepts stored in a Qdrant vector database. These concepts power Aurora, the RAG-enabled AI assistant running on OpenWebUI. + +**The core loop:** Content in (PDF/web) -> Text extracted -> Concepts enriched (Gemini) -> Vectors embedded (TEI/BGE-M3) -> Searchable knowledge (Qdrant) -> Aurora answers questions with citations. + +--- + +## 2. 
Infrastructure + +### Hosts + +| Host | IP (Tailscale) | Role | +|------|---------------|------| +| recon LXC | 100.64.0.24 (CT 130 on toc) | RECON application, dashboard, pipeline | +| cortex VM | 100.64.0.14 (VM 150 on toc) | Qdrant, TEI, Ollama, OpenWebUI | +| pi-nas | 100.64.0.21 (192.168.1.245) | NFS file server for PDF library | +| Contabo VPS | 100.64.0.1 (5.189.158.149) | Backup destination | + +### Services on cortex (100.64.0.14) + +| Service | Port | Purpose | +|---------|------|---------| +| Qdrant | 6333 | Vector database (recon_knowledge collection) | +| TEI (text-embeddings-inference) | 8090 | Embedding server (bge-m3, 1024-dim, ~1,711 emb/sec) | +| Ollama | 11434 | LLM server + fallback embeddings (~8 emb/sec) | +| OpenWebUI | 8080 | Aurora chat interface (ai.echo6.co) | + +### Services on recon LXC (100.64.0.24) + +| Service | Port | Purpose | +|---------|------|---------| +| RECON Dashboard | 8420 | Web UI + API for pipeline management | +| File Server | 8888 | PDF downloads (files.echo6.co) | + +### NFS Mount + +``` +pi-nas:/export/library -> /mnt/library (22TB, rw, NFSv3) +``` + +Contains ~13,000+ PDFs across: +- `Survival-Companion-Library/` (~12,900 PDFs in ~220 subdirectories) +- `Army_Pubs/` (~160 military field manuals) +- Other: `Gaming/`, `Reference/`, `Technical/` + +--- + +## 3. 
Architecture Overview + +``` + /mnt/library/ (NFS) + | + [recon scan] + | + catalogue (SQLite) + | + [recon queue] + | + +-----------+ [recon extract] +-----------+ + | PyPDF2 |--> data/text/ | Gemini | + | pdftotext | {hash}/page_N.txt | Flash | + | tesseract | | | 4 keys | + +-----------+ [recon enrich] +-----------+ + | + data/concepts/ + {hash}/window_N.json + | + [recon embed] + | + +----------+-----------+ + | TEI (primary) | + | bge-m3, 1024-dim | + | 1,711 emb/sec | + +----------+-----------+ + | + Qdrant (cortex:6333) + recon_knowledge collection + | + Aurora (OpenWebUI) + RAG search + citations +``` + +### Web Content Path + +``` + URL(s) ──> [recon ingest-url / crawl] + | + trafilatura extraction + chunk into ~2000-word pages + | + data/text/{hash}/page_N.txt + (enters at "extracted" status) + | + [enrich] -> [embed] + (same as PDF path) +``` + +--- + +## 4. Pipeline Stages + +### Status Flow + +``` +catalogued -> queued -> extracting -> extracted -> enriching -> enriched -> embedding -> complete + \-> failed +``` + +Web content enters at `extracted` status (text already extracted by trafilatura). + +### Stage Details + +| Stage | Tool | Input | Output | Speed | +|-------|------|-------|--------|-------| +| Scan | `recon scan` | /mnt/library/*.pdf | catalogue table | ~13K PDFs in ~30 min | +| Queue | `recon queue` | catalogue entries | documents table (status=queued) | Instant | +| Extract | `recon extract` | PDF files | data/text/{hash}/page_NNNN.txt | 4 workers, ~200/hr | +| Enrich | `recon enrich` | Text pages (10-page windows) | data/concepts/{hash}/window_N.json | 16 workers, 4 Gemini keys | +| Embed | `recon embed` | Concept JSONs | Qdrant vectors | TEI: 1,711 emb/sec | + +### Extraction Fallback Chain + +1. **PyPDF2** (fast, clean text) -> 2. **pdftotext** (handles complex layouts) -> 3. 
**Tesseract OCR** (scanned documents) + +### Enrichment Details + +- Model: `gemini-2.0-flash` +- Window size: 10 pages per API call (configurable) +- Workers: 16 concurrent (4 API keys x 4 workers each) +- Output format: JSON array of concept objects +- **CRITICAL**: Concept JSONs are saved to disk BEFORE any database operations +- Key rotation via `KeyRotator` class distributing across 4 Gemini API keys + +### Embedding Details + +- **Primary**: TEI at cortex:8090 (bge-m3 model, 1024 dimensions, ~1,711 embeddings/sec) +- **Fallback**: Ollama at cortex:11434 (bge-m3 model, ~8 embeddings/sec) +- Batch size: 128 embeddings per TEI request +- Distance metric: Cosine similarity +- **CRITICAL**: Dimensions are 1024 (bge-m3), NOT 384. Getting this wrong creates silent failures. + +--- + +## 5. Directory Structure + +``` +/opt/recon/ # Application root + recon.py # CLI entry point + config.yaml # Central configuration + .env # Gemini API keys (4 keys) + requirements.txt # Python dependencies + PROJECT-BIBLE.md # This file + README.md # Quick-start reference + run-full-pipeline.sh # Background pipeline runner + + lib/ # Core modules + __init__.py + api.py # Flask web dashboard + API (port 8420) + crawler.py # Site crawler (sitemap + BFS link-following) + embedder.py # Concept -> vector embedding (TEI/Ollama -> Qdrant) + enricher.py # Text -> concept extraction (Gemini) + extractor.py # PDF -> text extraction (PyPDF2/pdftotext/OCR) + ingester.py # ARGUS intel feed intake + status.py # SQLite DB operations (catalogue + documents) + utils.py # Config, hashing, URL generation, logging + web_scraper.py # URL -> text extraction (trafilatura) + + scripts/ # Operational scripts + backup.sh # Automated backup to Contabo (cron every 6h) + rebuild_qdrant.py # Nuclear recovery: re-embed all concepts + validate.py # Pipeline consistency validation + + data/ # Pipeline data (on local disk) + recon.db # SQLite status database + text/ # Extracted text + {content_hash}/ + meta.json # 
Document metadata + page_0001.txt # Page text (4-digit, 1-indexed) + page_0002.txt + ... + concepts/ # Enriched concepts (**BACK THESE UP**) + {content_hash}/ + window_1.json # Concept JSON array (10-page window) + window_2.json + ... + intel/ # ARGUS intel feeds + + logs/ # Application logs + recon.log # Main rotating log + backup.log # Backup operation log + backup_cron.log # Cron backup log + + venv/ # Python virtual environment +``` + +--- + +## 6. Database Schema + +### SQLite (data/recon.db) + +Two tables in WAL mode with thread-local connections. + +#### catalogue + +| Column | Type | Description | +|--------|------|-------------| +| hash | TEXT PK | MD5 content hash | +| filename | TEXT | Original filename | +| path | TEXT | Full filesystem path | +| size_bytes | INTEGER | File size | +| source | TEXT | Top-level directory (e.g., "Survival-Companion-Library") | +| category | TEXT | Second-level directory (e.g., "Bushcraft") | +| status | TEXT | "catalogued" or "processed" | +| discovered_at | TEXT | ISO timestamp | + +#### documents + +| Column | Type | Description | +|--------|------|-------------| +| hash | TEXT PK | MD5 content hash | +| filename | TEXT | Original filename | +| path | TEXT | Full path or URL | +| size_bytes | INTEGER | File/content size | +| page_count | INTEGER | Number of text pages | +| book_title | TEXT | Gemini-extracted title | +| book_author | TEXT | Gemini-extracted author | +| status | TEXT | Pipeline status | +| pages_extracted | INTEGER | Pages extracted | +| concepts_extracted | INTEGER | Concepts generated | +| vectors_inserted | INTEGER | Vectors in Qdrant | +| error_message | TEXT | Last error (if failed) | +| retry_count | INTEGER | Failure retry count | +| created_at | TEXT | ISO timestamp | +| updated_at | TEXT | ISO timestamp | + +### Qdrant (cortex:6333) + +Collection: `recon_knowledge` + +| Field | Type | Description | +|-------|------|-------------| +| vector | float[1024] | BGE-M3 embedding | +| doc_hash | keyword 
| Links to SQLite document | +| filename | keyword | Source filename | +| book_title | keyword | Document title | +| book_author | keyword | Author name | +| source_type | keyword | "document", "web", or "intel_feed" | +| download_url | keyword | files.echo6.co URL or source URL | +| content | text | Concept text (searchable) | +| summary | text | Concept summary | +| title | keyword | Concept title | +| domain | keyword | Knowledge domain | +| subdomain | keyword | Knowledge subdomain | +| keywords | keyword[] | Concept keywords | +| skill_level | keyword | beginner/intermediate/advanced/expert | +| key_facts | text[] | Key facts list | +| scenario_applicable | text[] | Applicable scenarios | +| cross_domain_tags | keyword[] | Cross-references | +| chapter | keyword | Source chapter | +| page_ref | keyword | Source page reference | +| notes | text | Additional notes | +| _window | integer | Source window number | +| _start_page | integer | Starting page in document | +| verification_status | keyword | "unverified" (default) | +| credibility_score | float | 0.7 (default) | +| language | keyword | "en" (default) | + +--- + +## 7. 
CLI Reference + +``` +recon <command> [options] +``` + +| Command | Description | Key Options | +|---------|-------------|-------------| +| `scan` | Scan library, catalogue new PDFs | `--path` | +| `queue` | Queue catalogued docs for processing | `--hash`, `--source`, `--category`, `--limit` | +| `extract` | Extract text from queued PDFs | `--workers` | +| `enrich` | Enrich extracted text via Gemini | `--workers`, `--limit` | +| `embed` | Embed concepts into Qdrant | `--workers`, `--limit` | +| `run` | Full pipeline (extract->enrich->embed) | `--workers`, `--enrich-workers`, `--limit` | +| `status` | Show pipeline status counts | | +| `catalogue` | Browse catalogue | `--sources`, `--categories`, `--source`, `--limit` | +| `failures` | Show failed documents | `--retry` | +| `search` | Semantic search | `query`, `--limit` | +| `upload` | Upload PDFs | `--file`, `--dir`, `--category` | +| `ingest-url` | Ingest web content | `url`, `--file`, `--category`, `--process` | +| `crawl` | Crawl a site | `url`, `--category`, `--include`, `--exclude`, `--max-pages`, `--dry-run`, `--process` | +| `validate` | Check pipeline consistency | `--deep` | +| `rebuild` | Rebuild Qdrant from concept JSONs | | +| `serve` | Start web dashboard (port 8420) | | +| `ingest` | Ingest ARGUS intel JSON | `--file`, `--directory` | + +### Common Workflows + +```bash +# Full library processing +recon scan && recon queue && recon run + +# Ingest a single web page with full processing +recon ingest-url "https://example.com/article" --category "Reference" --process + +# Dry-run crawl to preview URLs +recon crawl "https://docs.example.com" --include /docs/ --dry-run + +# Full crawl with processing +recon crawl "https://docs.example.com" --include /docs/ --category "Reference" --process + +# Upload a PDF +recon upload --file /path/to/document.pdf --category "Technical" + +# Check what failed and retry +recon failures +recon failures --retry +``` + +--- + +## 8. 
Web Dashboard + +### URL + +``` +http://100.64.0.24:8420 +``` + +### Pages + +| Route | Page | Description | +|-------|------|-------------| +| `/` | Dashboard | Knowledge base overview: document/concept/vector counts, source table, domain distribution bars, skill level breakdown, Qdrant health, recent completions, pipeline status | +| `/search` | Search | Semantic search with score bars, Web/PDF badges, download links | +| `/catalogue` | Catalogue | Browse all catalogued PDFs with source/category filters | +| `/upload` | Upload | PDF upload form with category datalist, recent uploads table | +| `/web-ingest` | Web Ingest | Two tabs: Single/Batch URL ingest, Site Crawl with preview | +| `/failures` | Failures | Failed documents with error messages and retry button | + +### API Endpoints + +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | `/api/search?q=...&limit=N` | Semantic search | +| GET | `/api/catalogue?source=...&limit=N` | Browse catalogue | +| GET | `/api/knowledge-stats` | Dashboard aggregation (totals, sources, domains, skills, Qdrant health) | +| POST | `/api/upload` | Upload PDF (multipart: file + category) | +| GET | `/api/upload/<hash>/status` | Check upload processing status | +| GET | `/api/upload/categories` | List available categories | +| POST | `/api/ingest-url` | Ingest single URL (json: url, category, process) | +| POST | `/api/ingest-urls` | Ingest multiple URLs (json: urls, category, process) | +| POST | `/api/crawl` | Crawl a site (json: url, category, include, exclude, max_pages, dry_run) | +| GET | `/api/crawl/<crawl_id>/status` | Poll crawl/pipeline progress | +| POST | `/api/failures/retry` | Re-queue all failed documents | + +### Dashboard Features + +- **Auto-refresh**: Every 30 seconds via JavaScript fetch +- **Knowledge cards**: Total documents, concepts, vectors, pages +- **Source table**: Per-source breakdown with document/concept/vector counts and PDF/WEB type badges +- **Domain distribution**: Horizontal bars 
showing top knowledge domains +- **Skill level breakdown**: beginner/intermediate/advanced/expert percentages +- **Qdrant health**: Connection status, points count, segments +- **Pipeline status**: Compact display of documents in each stage +- **Crawl polling**: Real-time stage tracking (ingesting -> enriching -> embedding) + +--- + +## 9. Concept JSON Schema + +Each window file (`data/concepts/{hash}/window_N.json`) contains a JSON array of concept objects: + +```json +[ + { + "title": "Water Purification Methods", + "content": "Detailed text about the concept...", + "summary": "Brief summary of the concept", + "domain": "Survival", + "subdomain": "Water", + "keywords": ["purification", "filtration", "boiling"], + "skill_level": "beginner", + "key_facts": ["Boiling kills 99.9% of pathogens", "..."], + "scenario_applicable": ["wilderness survival", "disaster preparedness"], + "cross_domain_tags": ["health", "camping"], + "chapter": "Chapter 3", + "page_ref": "pp. 45-48", + "notes": "Additional context or caveats", + "_window": 1, + "_start_page": 1 + } +] +``` + +--- + +## 10. Web Ingestion + +### Single URL + +```bash +recon ingest-url "https://example.com/article" --category "Reference" --process +``` + +Or via API: +```bash +curl -X POST http://100.64.0.24:8420/api/ingest-url \ + -H "Content-Type: application/json" \ + -d '{"url": "https://example.com/article", "category": "Reference", "process": true}' +``` + +### Site Crawl + +```bash +# Preview what would be crawled +recon crawl "https://docs.example.com" --include /docs/ --dry-run + +# Full crawl +recon crawl "https://docs.example.com" --include /docs/ --category "Reference" --process +``` + +### How It Works + +1. **URL discovery** (crawler.py): + - Tries sitemap.xml first (preferred, finds all pages) + - Falls back to BFS link-following if no sitemap + - Filters by include/exclude patterns + +2. 
**Content extraction** (web_scraper.py): + - Uses trafilatura for clean text extraction + - Chunks into ~2,000-word pages + - Same output format as PDF extractor: `data/text/{hash}/page_NNNN.txt` + - Content hash is MD5 of extracted text (deduplication) + +3. **Pipeline integration**: + - Web content enters at `extracted` status (no PDF extraction needed) + - Enrichment and embedding proceed identically to PDF content + - Qdrant vectors get `source_type: "web"` and `download_url` pointing to source URL + +--- + +## 11. Configuration Reference + +### config.yaml + +```yaml +# Root path for the PDF library (NFS mount from pi-nas) +library_root: /mnt/library + +processing: + extract_workers: 4 # Concurrent PDF extraction threads + enrich_workers: 16 # Concurrent Gemini enrichment threads (4 keys x 4) + embed_workers: 4 # Concurrent embedding threads + enrich_window_size: 5 # Pages per enrichment window (sent to Gemini) + embed_batch_size: 500 # Vectors per Qdrant upsert batch + rate_limit_delay: 0.1 # Delay between Gemini API calls (seconds) + max_retries: 5 # Max retries for failed documents + +embedding: + backend: tei # "tei" (primary, ~1,711 emb/sec) or "ollama" (fallback, ~8 emb/sec) + tei_host: 100.64.0.14 # TEI server (cortex) + tei_port: 8090 # TEI HTTP port + ollama_host: 100.64.0.14 # Ollama server (cortex) — fallback only + ollama_port: 11434 # Ollama HTTP port + model: bge-m3 # Embedding model name + dimensions: 1024 # CRITICAL: bge-m3 is 1024-dim, NOT 384 + batch_size: 128 # Embeddings per TEI batch request + +vector_db: + host: 100.64.0.14 # Qdrant server (cortex) + port: 6333 # Qdrant HTTP port + collection: recon_knowledge # Collection name + +gemini: + model: gemini-2.0-flash # Gemini model for enrichment + response_mime_type: application/json # Force JSON output + +web: + port: 8420 # Dashboard HTTP port + host: 0.0.0.0 # Bind to all interfaces + +paths: + base: /opt/recon # Application root + data: /opt/recon/data # Data directory + text: 
/opt/recon/data/text # Extracted text output + concepts: /opt/recon/data/concepts # Enriched concept JSONs + intel: /opt/recon/data/intel # ARGUS intel feeds + logs: /opt/recon/logs # Log files + db: /opt/recon/data/recon.db # SQLite database + +book_server: + base_url: https://files.echo6.co # Public URL prefix for PDF downloads + strip_prefix: /mnt/library # Path prefix to strip when generating URLs + +upload_paths: # Category -> filesystem path mapping for uploads + Survival Reference: /mnt/library/Survival-Companion-Library/Uploads + Military Doctrine: /mnt/library/Army_Pubs/Uploads + Gaming: /mnt/library/Gaming + Reference: /mnt/library/Reference + Technical: /mnt/library/Technical + default: /mnt/library # Fallback for unknown categories + +web_scraper: + words_per_page: 2000 # Target words per page chunk + fetch_timeout: 30 # HTTP request timeout (seconds) + rate_limit_delay: 1.0 # Delay between URL fetches (seconds) + max_batch_size: 50 # Max URLs per batch ingest + user_agent: "Mozilla/5.0 (compatible; RECON/1.0)" + +crawler: + user_agent: "Mozilla/5.0 (compatible; RECON/1.0)" + fetch_timeout: 30 # HTTP request timeout (seconds) + rate_limit_delay: 1.0 # Delay between page fetches (seconds) + max_pages: 500 # Max pages to discover per crawl + max_depth: 3 # Max link-following depth (BFS only) + default_exclude: # URL patterns to always skip + - /search + - /404 + - /login + - /signup + - /auth/ + - /api/ + - /assets/ + - /static/ +``` + +### .env + +``` +GEMINI_KEY_1= +GEMINI_KEY_2= +GEMINI_KEY_3= +GEMINI_KEY_4= +``` + +Four Gemini API keys rotated across 16 enrichment workers via `KeyRotator`. + +--- + +## 12. Aurora RAG Integration + +Aurora is the RAG-enabled AI assistant running on OpenWebUI (ai.echo6.co). + +### How It Works + +1. User asks a question in OpenWebUI +2. Aurora's OpenWebUI function/filter embeds the query via TEI (cortex:8090) +3. Searches Qdrant `recon_knowledge` collection for similar concepts +4. 
Top results are injected into the prompt as context +5. JOSIEFIED Qwen3 8B generates an answer with citations +6. Citations include `download_url` links (PDF files via files.echo6.co, web content via source URL) + +### Key Components + +- **Embedding**: Same TEI endpoint + bge-m3 model as RECON pipeline (ensures vector compatibility) +- **Search**: Cosine similarity, top-5 results by default +- **LLM**: `goekdenizguelmez/JOSIEFIED-Qwen3:8b` on Ollama (cortex:11434) +- **Citations**: Each result includes `download_url` — either `https://files.echo6.co/...` for PDFs or the original URL for web content + +--- + +## 13. Backup & Recovery + +### Automated Backups + +**Script**: `/opt/recon/scripts/backup.sh` +**Destination**: Contabo VPS (`root@100.64.0.1:/opt/backups/recon/`) +**Schedule** (cron): +- Every 6 hours: Full backup (concepts, text, DB, config, intel) +- Every 2 hours (off-hours): SQLite DB snapshot only + +### What's Backed Up + +| Component | Size | Priority | Notes | +|-----------|------|----------|-------| +| data/concepts/ | ~11M | **CRITICAL** | $130+ of Gemini API work | +| data/text/ | ~203M | High | Hours to regenerate | +| data/recon.db | ~6.5M | **CRITICAL** | All pipeline state | +| config.yaml + .env | ~2K | Important | Configuration | +| data/intel/ | ~4K | Low | Intel feed data | + +### What's NOT Backed Up + +- **Qdrant vectors**: Rebuilt from concept JSONs in ~10 minutes via `recon rebuild` +- **PDF library**: Lives on pi-nas NFS, backed up separately +- **venv/**: Recreated from requirements.txt + +### Recovery Procedures + +```bash +# Restore from backup +scp -r root@100.64.0.1:/opt/backups/recon/concepts/ /opt/recon/data/concepts/ +scp -r root@100.64.0.1:/opt/backups/recon/text/ /opt/recon/data/text/ +scp root@100.64.0.1:/opt/backups/recon/recon_LATEST.db /opt/recon/data/recon.db + +# Rebuild Qdrant vectors from concept JSONs +cd /opt/recon && source venv/bin/activate +python3 scripts/rebuild_qdrant.py +# Type REBUILD when prompted +``` + 
+--- + +## 14. Embedding Performance + +### TEI (Primary) vs Ollama (Fallback) + +| Metric | TEI (cortex:8090) | Ollama (cortex:11434) | +|--------|-------------------|----------------------| +| Speed | ~1,711 emb/sec | ~8 emb/sec | +| Model | bge-m3 | bge-m3 | +| Dimensions | 1024 | 1024 | +| Batch size | 128 | 1 | +| Cosine similarity | 0.999900 | 0.999900 | + +TEI is ~214x faster than Ollama for embeddings. Always use TEI unless it's down. + +### Qdrant Configuration + +- Collection: `recon_knowledge` +- Distance: Cosine +- HNSW indexing threshold: 20,000 (below this, brute-force search is used) +- Current state: Brute-force (under 20K vectors) — this is normal and performant at current scale + +--- + +## 15. Content Hashing + +- **PDF content**: `MD5(file_bytes)` — stable across renames, detects exact duplicates +- **Web content**: `MD5(extracted_text)` — deduplicates by content, not URL +- Hash is used as the primary key in both SQLite tables and as the directory name for text/concept storage + +--- + +## 16. Source Type Handling + +| Source | Path Format | source_type | download_url | Badge | +|--------|-------------|-------------|--------------|-------| +| PDF | `/mnt/library/...` | document | `https://files.echo6.co/...` | PDF | +| Web | `https://...` | web | Original URL | Web | +| Intel | JSON feed | intel_feed | — | — | + +The `generate_download_url()` function in utils.py handles the routing: +- URLs starting with `http://` or `https://` are returned as-is +- File paths are converted to `files.echo6.co` URLs + +--- + +## 17. Lessons Learned + +### RECON Rebuild Lessons + +1. **Verify infrastructure before writing code.** Check Qdrant, TEI, Ollama connectivity first. +2. **Dimensions are 1024, NOT 384.** BGE-M3 uses 1024-dimensional vectors. This caused silent failures in early builds. +3. **TEI >> Ollama for embeddings.** 1,711 vs 8 embeddings/sec. A 214x speedup that makes batch processing viable. +4. 
**Dynamic discovery over hardcoded paths.** Let the pipeline discover what's on disk rather than maintaining static file lists. +5. **Web content uses the same pipeline.** After text extraction, web and PDF content follow identical enrichment and embedding paths. +6. **Sitemap > link-following.** Sitemaps discover all pages reliably; BFS link-following misses orphaned pages and is slower. +7. **Save to disk before DB operations.** Concept JSONs are written to disk first, then the database is updated. This means recovery is always possible from the JSON files. +8. **NFS over large file sets is slow.** Scanning 13K PDFs over NFS takes ~30 minutes due to MD5 hashing over the network. Plan accordingly. + +### Operational Gotchas + +- `recon scan` can appear stuck on large PDFs over NFS — it's hashing, not hung +- Some PDFs have corrupt metadata that crashes PyPDF2 — the extractor catches this and falls back +- Gemini rate limits hit with 16 workers — the `KeyRotator` distributes across 4 keys to mitigate +- `iptables-persistent` hangs on interactive prompts in LXC containers — use manual persistence +- The recon LXC has no tmux/screen — use `nohup` for long-running background tasks + +--- + +## 18. Monitoring + +### Pipeline Status + +```bash +# Quick status +recon status + +# Dashboard +http://100.64.0.24:8420 + +# Tail logs +tail -f /opt/recon/logs/recon.log + +# Pipeline run log (when running full background pipeline) +tail -f /opt/recon/pipeline.log +``` + +### Health Checks + +```bash +# Qdrant +curl -s http://100.64.0.14:6333/collections/recon_knowledge | python3 -m json.tool + +# TEI +curl -s http://100.64.0.14:8090/info + +# Ollama +curl -s http://100.64.0.14:11434/api/tags | python3 -m json.tool + +# NFS mount +df -h /mnt/library + +# Backup logs +tail -20 /opt/recon/logs/backup.log +``` + +### Validation + +```bash +# Quick validation +recon validate + +# Deep validation (checks all files on disk) +recon validate --deep +``` + +--- + +## 19. 
Current State + +*As of 2026-02-16* + +### Pipeline Progress + +| Status | Count | +|--------|-------| +| Catalogued | 10,162 | +| Queued | 8,982 | +| Extracted | 872 | +| Complete | 302 | +| Failed | 2 | + +### Vector Database + +- Qdrant points: 4,661 (3,144 PDF + 1,517 web) +- Segments: 8 +- Indexing: Brute-force (under 20K threshold) + +### Active Processing + +Full pipeline running in background via `nohup` — extracting through the 8,982 queued documents. Expected to take ~40 hours for full extract -> enrich -> embed cycle. + +### Backups + +- Schedule: Every 6 hours (full) + every 2 hours (DB only) +- Destination: Contabo VPS (`/opt/backups/recon/`) +- Last verified: 2026-02-16 (220M total backup size) + +--- + +## 20. Dependencies + +### System Packages + +- Python 3.11+ +- pdftotext (poppler-utils) +- tesseract-ocr +- sqlite3 + +### Python Packages (key) + +| Package | Version | Purpose | +|---------|---------|---------| +| Flask | 3.1.2 | Web dashboard | +| google-generativeai | 0.8.6 | Gemini API for enrichment | +| qdrant-client | 1.16.2 | Vector database client | +| PyPDF2 | 3.0.1 | PDF text extraction | +| trafilatura | 2.0.0 | Web content extraction | +| beautifulsoup4 | 4.14.3 | HTML parsing for crawler | +| lxml | 6.0.2 | XML/HTML parsing | +| pytesseract | 0.3.13 | OCR fallback | +| requests | 2.32.5 | HTTP client | +| PyYAML | 6.0.3 | Config file parsing | + +Full list in `requirements.txt`. diff --git a/README.md b/README.md new file mode 100644 index 0000000..b7518b9 --- /dev/null +++ b/README.md @@ -0,0 +1,89 @@ +# RECON -- Knowledge Extraction Pipeline + +Extracts structured knowledge from PDFs and web content into a Qdrant vector database for RAG retrieval by Aurora. 
+ +## Quick Start + +```bash +# Activate +cd /opt/recon && source venv/bin/activate + +# Scan library for new PDFs +recon scan + +# Queue and process +recon queue +recon extract +recon enrich +recon embed + +# Or run full pipeline +recon run + +# Ingest a web page +recon ingest-url "https://example.com/article" --category "Category" --process + +# Crawl an entire docs site +recon crawl "https://docs.example.com" --include /docs/ --category "Category" --process + +# Upload a PDF +recon upload --file /path/to/document.pdf --category "Category" + +# Search +recon search "water purification methods" + +# Check status +recon status +recon failures +``` + +## Dashboard + +http://100.64.0.24:8420 + +## Services + +| Service | Location | Purpose | +|---------|----------|---------| +| RECON Dashboard | recon:8420 | Pipeline management + API | +| Qdrant | cortex:6333 | Vector database | +| TEI | cortex:8090 | Embeddings (1,711/sec) | +| Ollama | cortex:11434 | Chat + fallback embeddings | +| OpenWebUI | cortex:8080 (ai.echo6.co) | Aurora chat with RAG | +| File Server | recon:8888 (files.echo6.co) | PDF downloads | + +## Key Paths + +| Path | Contents | +|------|----------| +| /opt/recon/ | Application code | +| /opt/recon/data/concepts/ | Gemini extractions (**CRITICAL -- back these up**) | +| /opt/recon/data/text/ | Extracted text | +| /opt/recon/data/recon.db | SQLite status DB | +| /mnt/library/ | PDF library (NFS from pi-nas) | + +## Backups + +Automated every 6 hours to Contabo VPS via `/opt/recon/scripts/backup.sh`. +Concept JSONs are the most valuable data ($130+ of Gemini API work). +Qdrant is NOT backed up -- rebuilt from JSONs in ~10 minutes via `recon rebuild`. 
+ +## Monitoring + +```bash +# Pipeline status +recon status + +# Tail logs +tail -f /opt/recon/logs/recon.log + +# Pipeline run log +tail -f /opt/recon/pipeline.log + +# Validate consistency +recon validate --deep +``` + +## Full Documentation + +See [PROJECT-BIBLE.md](PROJECT-BIBLE.md) for complete system documentation. diff --git a/api.py b/api.py new file mode 100644 index 0000000..fee6d3b --- /dev/null +++ b/api.py @@ -0,0 +1,348 @@ +import json +import os + +import requests as http_requests +from flask import Flask, request, jsonify, redirect +from qdrant_client import QdrantClient +from qdrant_client.models import Filter, FieldCondition, MatchValue + +from .utils import get_config, content_hash, setup_logging +from .status import StatusDB + +logger = setup_logging('recon.api') + +app = Flask(__name__) + +HTML_TEMPLATE = """ + + +RECON + + + + +
+

RECON

+
Knowledge Base Management System
+
+ +
+ {{CONTENT}} +
+ +""" + + +def render(content): + return HTML_TEMPLATE.replace('{{CONTENT}}', content) + + +@app.route('/') +def dashboard(): + db = StatusDB() + counts = db.get_status_counts() + cat = counts.get('catalogue', {}) + doc = counts.get('documents', {}) + + total_cat = sum(cat.values()) + total_doc = sum(doc.values()) + complete = doc.get('complete', 0) + failed = doc.get('failed', 0) + + stats = f""" +
+
Catalogued PDFs
{total_cat}
+
In Pipeline
{total_doc}
+
Complete
{complete}
+
Failed
{failed}
+
+

Pipeline Status

+ + + """ + for status in ['queued', 'extracting', 'extracted', 'enriching', 'enriched', 'embedding', 'complete', 'failed']: + count = doc.get(status, 0) + stats += f'\n' + + stats += "
StatusCount
{status}{count}
" + + sources = db.source_breakdown() + if sources: + stats += '

Sources

' + for s in sources: + size_mb = (s.get('total_bytes', 0) or 0) / (1024 * 1024) + stats += f"" + stats += "
SourceCountSize
{s['source']}{s['count']}{size_mb:.1f} MB
" + + return render(stats) + + +@app.route('/search') +def search_page(): + query = request.args.get('q', '') + if not query: + content = """ +

Semantic Search

+
+ +
+

Enter a query to search across all embedded concepts.

+ """ + return render(content) + + config = get_config() + limit = int(request.args.get('limit', 20)) + source_filter = request.args.get('source_type', None) + + try: + url = f"http://{config['embedding']['host']}:{config['embedding']['port']}/api/embed" + resp = http_requests.post(url, json={ + "model": config['embedding']['model'], + "input": query + }, timeout=120) + resp.raise_for_status() + query_vector = resp.json()['embeddings'][0] + + qdrant = QdrantClient( + host=config['vector_db']['host'], + port=config['vector_db']['port'], + timeout=60 + ) + + search_filter = None + if source_filter: + search_filter = Filter(must=[ + FieldCondition(key="source_type", match=MatchValue(value=source_filter)) + ]) + + results = qdrant.query_points( + collection_name=config['vector_db']['collection'], + query=query_vector, + limit=limit, + query_filter=search_filter + ).points + + content = f""" +

Results for: {query}

+
+ +
+

{len(results)} results

+ """ + + for r in results: + p = r.payload + title = p.get('title', 'Untitled') + summary = p.get('summary', p.get('content', '')[:200]) + score = r.score + domains = p.get('domain', []) + book = p.get('book_title', p.get('filename', '')) + source_type = p.get('source_type', 'document') + + domain_tags = ''.join(f'{d}' for d in (domains if isinstance(domains, list) else [])) + + content += f""" +
+ {score:.4f} +
{title}
+
{book} | {source_type} | {p.get('skill_level', 'unknown')}
+
{summary}
+
{domain_tags}
+
+ """ + + return render(content) + + except Exception as e: + return render(f'

Search error: {e}

') + + +@app.route('/catalogue') +def catalogue_page(): + db = StatusDB() + source = request.args.get('source', None) + category = request.args.get('category', None) + limit = int(request.args.get('limit', 100)) + + docs = db.get_all_documents(source=source, category=category, limit=limit) + + content = '

Document Catalogue

' + + sources = db.get_sources() + if sources: + content += '
' + content += 'All' + for s in sources: + content += f'{s}' + content += '
' + + content += """ + """ + + for d in docs: + status = d.get('status', 'unknown') + content += f""" + + + + + + + """ + + content += "
FilenameSourceStatusPagesConceptsVectors
{d.get('filename', '?')}{d.get('source', '')}{status}{d.get('pages_extracted', 0)}{d.get('concepts_extracted', 0)}{d.get('vectors_inserted', 0)}
" + return render(content) + + +@app.route('/failures') +def failures_page(): + db = StatusDB() + failures = db.get_failures() + + content = '

Failed Documents

' + + if not failures: + content += '

No failures.

' + return render(content) + + content += '' + for f in failures: + content += f""" + + + + + """ + + content += "
FilenameErrorRetriesActions
{f.get('filename', '?')}{f.get('error_message', 'unknown')[:100]}{f.get('retry_count', 0)}
+ +
" + return render(content) + + +@app.route('/api/search', methods=['POST']) +def api_search(): + config = get_config() + data = request.get_json() + if not data or 'query' not in data: + return jsonify({'error': 'Missing query'}), 400 + + query = data['query'] + limit = data.get('limit', 20) + source_type = data.get('source_type', None) + + try: + url = f"http://{config['embedding']['host']}:{config['embedding']['port']}/api/embed" + resp = http_requests.post(url, json={ + "model": config['embedding']['model'], + "input": query + }, timeout=120) + resp.raise_for_status() + query_vector = resp.json()['embeddings'][0] + + qdrant = QdrantClient( + host=config['vector_db']['host'], + port=config['vector_db']['port'], + timeout=60 + ) + + search_filter = None + if source_type: + search_filter = Filter(must=[ + FieldCondition(key="source_type", match=MatchValue(value=source_type)) + ]) + + results = qdrant.query_points( + collection_name=config['vector_db']['collection'], + query=query_vector, + limit=limit, + query_filter=search_filter + ).points + + return jsonify({ + 'query': query, + 'results': [ + { + 'score': r.score, + 'payload': r.payload + } + for r in results + ] + }) + + except Exception as e: + return jsonify({'error': str(e)}), 500 + + +@app.route('/api/status') +def api_status(): + db = StatusDB() + return jsonify(db.get_status_counts()) + + +@app.route('/api/retry/', methods=['POST']) +def api_retry(file_hash): + db = StatusDB() + db.increment_retry(file_hash) + return redirect('/failures') + + +@app.route('/api/ingest', methods=['POST']) +def api_ingest(): + from .ingester import ingest_intel + data = request.get_json() + if not data: + return jsonify({'error': 'No JSON body'}), 400 + + config = get_config() + result = ingest_intel(data, config) + if result is not None: + return jsonify({'intel_id': result}) + return jsonify({'error': 'Ingestion failed'}), 500 + + +def run_server(): + config = get_config() + host = config['web']['host'] + port = 
config['web']['port'] + logger.info(f"Starting RECON web dashboard on {host}:{port}") + app.run(host=host, port=port, debug=False) diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..1f5d1b0 --- /dev/null +++ b/config.yaml @@ -0,0 +1,440 @@ +# RECON Configuration +# See PROJECT-BIBLE.md Section 11 for full documentation + +# Root path for the PDF library (NFS mount from pi-nas) +library_root: /mnt/library + +processing: + max_pdf_size_mb: 2000 # Raised from 200MB default for large scanned books + extract_workers: 4 # Concurrent PDF extraction threads + enrich_workers: 16 # Concurrent Gemini enrichment threads (4 keys x 4) + embed_workers: 4 # Concurrent embedding threads + enrich_window_size: 5 # Pages per enrichment window (sent to Gemini) + embed_batch_size: 500 # Vectors per Qdrant upsert batch + rate_limit_delay: 0.1 # Delay between Gemini API calls (seconds) + max_retries: 5 # Max retries for failed documents + extract_timeout: 1800 # Max seconds per document extraction (30 min, allows vision OCR) + page_timeout: 30 # Max seconds per page extraction + enrich_max_retries: 5 # Max retries per enrichment window + enrich_base_delay: 5.0 # Base backoff delay (seconds) — ~5s, 10s, 20s, 40s, 80s + enrich_max_delay: 120.0 # Maximum backoff delay cap (seconds) + +embedding: + backend: tei # "tei" (primary, ~1,711 emb/sec) or "ollama" (fallback, ~8 emb/sec) + tei_host: 100.64.0.14 # TEI server (cortex) + tei_port: 8090 # TEI HTTP port + ollama_host: 100.64.0.14 # Ollama server (cortex) — fallback only + ollama_port: 11434 # Ollama HTTP port + model: bge-m3 # Embedding model name + dimensions: 1024 # CRITICAL: bge-m3 is 1024-dim, NOT 384 + batch_size: 128 # Embeddings per TEI batch request + +sparse_embedding: + enabled: true + host: 100.64.0.14 # Sparse embedding service (cortex) + port: 8091 # Sparse embedding HTTP port + +vector_db: + host: 100.64.0.14 # Qdrant server (cortex) + port: 6333 # Qdrant HTTP port + collection: recon_knowledge_hybrid 
# Collection name + +gemini: + model: gemini-2.0-flash # Gemini model for enrichment + response_mime_type: application/json # Force JSON output from Gemini + +web: + port: 8420 # Dashboard HTTP port + host: 0.0.0.0 # Bind address (all interfaces) + +paths: + base: /opt/recon # Application root + data: /opt/recon/data # Data directory + text: /opt/recon/data/text # Extracted text output (data/text/{hash}/page_NNNN.txt) + concepts: /opt/recon/data/concepts # Enriched concept JSONs (data/concepts/{hash}/window_N.json) + intel: /opt/recon/data/intel # ARGUS intel feeds + logs: /opt/recon/logs # Log files + db: /opt/recon/data/recon.db # SQLite database (WAL mode) + +book_server: + base_url: https://files.echo6.co # Public URL prefix for PDF downloads + strip_prefix: /mnt/library # Path prefix stripped when generating download URLs + +upload_paths: # Category -> filesystem path mapping for uploads + Survival Reference: /mnt/library/Survival-Companion-Library/Uploads + Military Doctrine: /mnt/library/Army_Pubs/Uploads + Gaming: /mnt/library/Gaming + Reference: /mnt/library/Reference + Technical: /mnt/library/Technical + default: /mnt/library # Fallback for unknown categories + +web_scraper: + words_per_page: 2000 # Target words per page chunk for web content + fetch_timeout: 30 # HTTP request timeout (seconds) + rate_limit_delay: 1.0 # Delay between URL fetches (seconds) + max_batch_size: 50 # Max URLs per batch ingest + user_agent: "Mozilla/5.0 (compatible; RECON/1.0)" + +crawler: + user_agent: "Mozilla/5.0 (compatible; RECON/1.0)" + fetch_timeout: 30 # HTTP request timeout (seconds) + rate_limit_delay: 1.0 # Delay between page fetches (seconds) + max_pages: 500 # Max pages to discover per crawl + max_depth: 3 # Max link-following depth (BFS only, not sitemap) + inter_site_cooldown: 30 # Seconds to wait between crawling different sites + recrawl_interval_days: 7 # Skip sites crawled within this many days + + default_exclude: # URL patterns always excluded from crawling 
+ - /search + - /404 + - /login + - /signup + - /auth/ + - /api/ + - /assets/ + - /static/ + - /cart + - /checkout + - /account + - /register + - /subscribe + - /membership + - /shop + - /store + - /product + - /wp-admin + - /feed + - /wp-json + - /xmlrpc + - /.well-known + - /cdn-cgi + + # ─── Crawl Targets ───────────────────────────────────────────── + # Sites are crawled by the scheduler loop in tier order (1 first). + # Per-site delay overrides global rate_limit_delay for that site. + # Per-site max_pages/max_depth override global defaults. + + # Disabled 2026-04-14 for refactor — see refactored-recon repo for context + sites: [] + + # sites: + # + # # ═══ TIER 1 — Free, authoritative, high-density ═══ + # + # - url: https://hesperian.org/all-hesperian-health-guides + # category: Medical + # max_depth: 3 + # delay: 3.0 + # tier: 1 + # notes: "Free health guides — WTIND, midwives, community health" + # + # - url: https://swsbm.com + # category: Medical + # max_depth: 3 + # delay: 3.0 + # tier: 1 + # notes: "Michael Moore's entire free clinical herbal library — PDFs" + # + # - url: https://swsbm.henriettesherbal.com + # category: Medical + # max_depth: 3 + # delay: 3.0 + # tier: 1 + # notes: "Mirror of Moore's library — grab both" + # + # - url: https://nchfp.uga.edu + # category: Sustainment Systems + # max_depth: 3 + # delay: 2.0 + # tier: 1 + # notes: "USDA canning/preservation safety authority" + # + # - url: https://extension.uidaho.edu + # category: Foundational Skills + # max_depth: 3 + # delay: 2.0 + # tier: 1 + # notes: "Idaho-specific — soil, water, crops, livestock" + # + # - url: https://extension.usu.edu + # category: Foundational Skills + # max_depth: 3 + # delay: 2.0 + # tier: 1 + # notes: "Utah State — Idaho-adjacent climate" + # + # - url: https://attra.ncat.org + # category: Sustainment Systems + # max_depth: 3 + # delay: 3.0 + # tier: 1 + # notes: "ATTRA sustainable ag — hundreds of free publications" + # + # - url: https://pfaf.org + # 
category: Sustainment Systems + # max_depth: 3 + # delay: 3.0 + # tier: 1 + # notes: "Plants For A Future — 7,000+ edible/medicinal plant profiles" + # + # - url: https://eattheweeds.com + # category: Sustainment Systems + # max_depth: 3 + # delay: 3.0 + # tier: 1 + # notes: "Green Deane — 1,000+ foraging plant articles" + # + # - url: https://lowtechmagazine.com + # category: Off-Grid Systems + # max_depth: 3 + # delay: 3.0 + # tier: 1 + # notes: "Exceptional low-tech systems analysis" + # + # - url: https://appropedia.org + # category: Off-Grid Systems + # max_depth: 3 + # delay: 3.0 + # tier: 1 + # notes: "Appropriate technology wiki" + # + # - url: https://journeytoforever.org + # category: Off-Grid Systems + # max_depth: 3 + # delay: 3.0 + # tier: 1 + # notes: "VITA manuals, biodiesel, biogas, hand tools archive" + # + # - url: https://cd3wd.com + # category: Off-Grid Systems + # max_depth: 2 + # delay: 3.0 + # tier: 1 + # notes: "1,050+ appropriate technology eBooks — index pages only" + # + # - url: https://practicalselfreliance.com + # category: Sustainment Systems + # max_depth: 3 + # delay: 3.0 + # tier: 1 + # notes: "Ashley Adamant — foraging, preservation, homesteading" + # + # - url: https://open.oregonstate.edu/permaculture + # category: Off-Grid Systems + # max_depth: 3 + # delay: 3.0 + # tier: 1 + # notes: "Millison's free permaculture textbook" + # + # - url: https://open.oregonstate.edu/permaculturedesign + # category: Off-Grid Systems + # max_depth: 3 + # delay: 3.0 + # tier: 1 + # notes: "Millison's advanced permaculture textbook" + # + # - url: https://mushroomexpert.com + # category: Sustainment Systems + # max_depth: 3 + # delay: 3.0 + # tier: 1 + # notes: "Michael Kuo — mushroom ID, taxonomy, regional coverage" + # + # # ═══ TIER 2 — High value, second pass ═══ + # + # - url: https://motherearthnews.com + # category: Foundational Skills + # max_depth: 2 + # max_pages: 200 + # delay: 8.0 + # tier: 2 + # notes: "50 years of homesteading 
archive — large commercial site, be polite" + # + # - url: https://permacultureresearchinstitute.com + # category: Off-Grid Systems + # max_depth: 3 + # delay: 5.0 + # tier: 2 + # notes: "Geoff Lawton — articles, case studies" + # + # - url: https://learnyourland.com + # category: Sustainment Systems + # max_depth: 3 + # delay: 5.0 + # tier: 2 + # notes: "Adam Haritan — foraging articles" + # + # - url: https://herbswithRosalee.com + # category: Medical + # max_depth: 3 + # delay: 5.0 + # tier: 2 + # notes: "Rosalee de la Foret — clinical herbalism articles" + # + # - url: https://commonwealthherbs.com + # category: Medical + # max_depth: 3 + # delay: 5.0 + # tier: 2 + # notes: "Katja and Ryn — clinical herbalism" + # + # - url: https://soilfoodweb.com + # category: Off-Grid Systems + # max_depth: 3 + # delay: 5.0 + # tier: 2 + # notes: "Elaine Ingham soil biology — archive before it goes dark" + # + # - url: https://rocketstoves.com + # category: Off-Grid Systems + # max_depth: 3 + # delay: 5.0 + # tier: 2 + # notes: "Ianto Evans — rocket mass heater designs and PDFs" + # + # - url: https://farmsteadmeatsmith.com + # category: Sustainment Systems + # max_depth: 2 + # delay: 5.0 + # tier: 2 + # notes: "Brandon Sheard — butchering articles (free content only)" + # + # - url: https://deeranddeerhunting.com + # category: Sustainment Systems + # max_depth: 2 + # delay: 5.0 + # tier: 2 + # notes: "Field dressing, processing, hunting technique library" + # + # # ═══ TIER 3 — Government (authoritative) ═══ + # + # - url: https://plants.usda.gov + # category: Sustainment Systems + # max_depth: 2 + # delay: 2.0 + # tier: 3 + # notes: "USDA native plant database" + # + # - url: https://ars.usda.gov + # category: Sustainment Systems + # max_depth: 2 + # delay: 2.0 + # tier: 3 + # notes: "USDA Agricultural Research publications" + # + # - url: https://nrcs.usda.gov + # category: Off-Grid Systems + # max_depth: 2 + # delay: 2.0 + # tier: 3 + # notes: "Soil surveys, conservation 
practice standards" + # + # - url: https://ready.gov + # category: Scenario Playbooks + # max_depth: 3 + # delay: 2.0 + # tier: 3 + # notes: "FEMA emergency preparedness guides" + # + # - url: https://emergency.cdc.gov + # category: Medical + # max_depth: 3 + # delay: 2.0 + # tier: 3 + # notes: "Public health emergency references" + # + # - url: https://agri.idaho.gov + # category: Foundational Skills + # max_depth: 2 + # delay: 2.0 + # tier: 3 + # notes: "Idaho Dept of Agriculture — local relevance" + # + # - url: https://driveonwood.com + # category: Off-Grid Systems + # max_depth: 3 + # delay: 3.0 + # tier: 3 + # notes: "Wood gasification — FEMA manual + modern improvements" + # + # # ═══ TIER 4 — Selective scrape (specific sections only) ═══ + # + # - url: https://richsoil.com + # category: Off-Grid Systems + # max_depth: 2 + # delay: 5.0 + # tier: 4 + # notes: "Paul Wheaton — rocket mass heaters, natural building" + # + # - url: https://wildfoodgirl.com + # category: Sustainment Systems + # max_depth: 3 + # delay: 5.0 + # tier: 4 + # notes: "Colorado foraging — Mountain West species" + # + # - url: https://foragersharvest.com + # category: Sustainment Systems + # max_depth: 3 + # delay: 5.0 + # tier: 4 + # notes: "Sam Thayer's site — articles" + # + # - url: https://mountainroseherbs.com/blog + # category: Medical + # max_depth: 2 + # delay: 5.0 + # tier: 4 + # notes: "Herb profiles and preparations — blog section only" + # + # - url: https://herbalprepper.com + # category: Medical + # max_depth: 3 + # delay: 5.0 + # tier: 4 + # notes: "Cat Ellis — grid-down herbalism" + # + # - url: https://prolongedfieldcare.org + # category: Medical + # max_depth: 3 + # delay: 5.0 + # tier: 4 + # notes: "PFC Collective — austere medical protocols" + # +service: + scan_interval: 3600 # Seconds between library scans (1 hour) + stage_poll_interval: 30 # Seconds stages sleep when idle + progress_interval: 60 # Seconds between progress log lines + +peertube: + api_base: 
def repair_json(text):
    """Attempt to repair common LLM JSON output issues, including truncation.

    Steps, in order:
      1. Strip ASCII control characters other than tab, newline and carriage
         return.
      2. Drop trailing commas that sit directly before ``}`` or ``]``.
      3. If the result parses, return it.
      4. Otherwise cut the text after the last top-level-complete ``{...}``
         object and append one ``]`` for every array that was still open at
         that point.

    Args:
        text: Raw model output expected to contain JSON.

    Returns:
        The repaired string. It is not guaranteed to be valid JSON; when no
        complete object is found the cleaned text is returned as-is and the
        caller's ``json.loads`` will raise.
    """
    # Remove control characters except newlines and tabs
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)
    # Remove trailing commas before } or ]
    # NOTE: this is a plain regex pass, so a comma followed by } or ] inside a
    # string value is altered as well.
    text = re.sub(r',\s*([}\]])', r'\1', text)

    # Nothing more to do when the cleaned text already parses.
    try:
        json.loads(text, strict=False)
        return text
    except json.JSONDecodeError:
        pass

    # Walk the text once, ignoring everything inside string literals, and
    # remember where the last object closed back to brace depth 0 together
    # with how many arrays were open at that moment.
    last_complete = -1
    open_arrays_at_last = 0
    depth_brace = 0
    depth_bracket = 0
    in_string = False
    escape = False

    for i, ch in enumerate(text):
        if escape:
            # Character following a backslash inside a string: skip it.
            escape = False
            continue
        if ch == '\\' and in_string:
            escape = True
            continue
        if ch == '"':
            in_string = not in_string
            continue
        if in_string:
            continue
        if ch == '{':
            depth_brace += 1
        elif ch == '}':
            depth_brace -= 1
            if depth_brace == 0:
                last_complete = i
                open_arrays_at_last = depth_bracket
        elif ch == '[':
            depth_bracket += 1
        elif ch == ']':
            depth_bracket -= 1

    if last_complete > 0:
        truncated = text[:last_complete + 1].rstrip().rstrip(',')
        # Close the arrays that were open at the cut point. The depth comes
        # from the string-aware scan above; counting '[' / ']' characters in
        # the text would also count brackets that are part of string values.
        truncated += ']' * max(open_arrays_at_last, 0)
        return truncated

    return text
class KeyRotator:
    """Hand out API keys in round-robin order.

    A single instance is passed to every enrichment worker thread, so the
    read-and-advance step in ``next()`` is guarded by a lock; without it two
    threads can observe the same index and the counter can lose increments.

    Attributes:
        keys: The sequence of keys to cycle through (kept by reference).
        index: Total number of keys handed out so far.
    """

    def __init__(self, keys):
        # Imported here because the module's import block has no `threading`.
        import threading

        self.keys = keys
        self.index = 0
        self._lock = threading.Lock()

    def next(self):
        """Return the next key in rotation.

        Raises:
            ValueError: If no keys were configured.
        """
        if not self.keys:
            raise ValueError("No Gemini API keys configured")
        with self._lock:
            key = self.keys[self.index % len(self.keys)]
            self.index += 1
        return key
config['processing']['enrich_window_size'] + delay = config['processing']['rate_limit_delay'] + max_retries = config['processing']['max_retries'] + + if not os.path.exists(text_dir): + db.mark_failed(file_hash, f"Text directory not found: {text_dir}") + return False + + db.update_status(file_hash, 'enriching') + + try: + os.makedirs(concepts_dir, exist_ok=True) + + page_files = sorted([f for f in os.listdir(text_dir) if f.startswith('page_') and f.endswith('.txt')]) + if not page_files: + db.mark_failed(file_hash, "No page files found") + return False + + pages_text = [] + for pf in page_files: + with open(os.path.join(text_dir, pf), encoding='utf-8') as f: + pages_text.append(f.read()) + + windows = [] + for i in range(0, len(pages_text), window_size): + window_pages = pages_text[i:i + window_size] + combined = "\n\n".join(f"--- Page {i + j + 1} ---\n{t}" for j, t in enumerate(window_pages)) + windows.append((i, combined)) + + total_concepts = 0 + for w_idx, (start_page, window_text) in enumerate(windows): + window_file = os.path.join(concepts_dir, f"window_{w_idx+1:04d}.json") + + if os.path.exists(window_file): + with open(window_file, encoding='utf-8') as f: + existing = json.load(f) + total_concepts += len(existing) + logger.debug(f" Window {w_idx+1} already exists, skipping") + continue + + if len(window_text.strip()) < 50: + with open(window_file, 'w') as f: + json.dump([], f) + continue + + concepts = None + for attempt in range(max_retries): + try: + key = key_rotator.next() + concepts = enrich_window(window_text, key, config) + break + except Exception as e: + logger.warning(f" Window {w_idx+1} attempt {attempt+1} failed: {e}") + if attempt < max_retries - 1: + time.sleep(delay * (attempt + 1) * 2) + + if concepts is None: + db.mark_failed(file_hash, f"All retries failed for window {w_idx+1}") + return False + + if not isinstance(concepts, list): + concepts = [concepts] if isinstance(concepts, dict) else [] + + for c_idx, concept in enumerate(concepts): + 
def run_enrichment(workers=None, limit=None):
    """Enrich every document currently in the 'extracted' state.

    Args:
        workers: Thread-pool size; falls back to
            ``config['processing']['enrich_workers']`` when falsy.
        limit: Forwarded to ``StatusDB.get_by_status`` to cap the batch.

    Returns:
        Number of documents for which ``enrich_single`` returned a truthy
        result. Returns 0 when no API keys are configured or there is
        nothing to enrich.
    """
    config = get_config()
    db = StatusDB()
    if not workers:
        workers = config['processing']['enrich_workers']

    keys = config.get('gemini_keys', [])
    if not keys:
        logger.error("No Gemini API keys configured in .env")
        return 0

    key_rotator = KeyRotator(keys)

    pending = db.get_by_status('extracted', limit=limit)
    if not pending:
        logger.info("No extracted documents to enrich")
        return 0

    logger.info(f"Enriching {len(pending)} documents with {workers} workers, {len(keys)} API key(s)")
    succeeded = 0

    with ThreadPoolExecutor(max_workers=workers) as pool:
        # Map each future back to its document so failures can be attributed.
        # Every task receives its own StatusDB instance
        # (presumably to avoid sharing a connection across threads — confirm).
        doc_by_future = {}
        for entry in pending:
            task = pool.submit(enrich_single, entry['hash'], StatusDB(), config, key_rotator)
            doc_by_future[task] = entry

        for finished in as_completed(doc_by_future):
            doc = doc_by_future[finished]
            try:
                outcome = finished.result()
            except Exception as e:
                # A crash in one worker must not abort the rest of the batch.
                logger.error(f"Worker error for {doc['hash']}: {e}")
            else:
                if outcome:
                    succeeded += 1

    logger.info(f"Enrichment complete: {succeeded}/{len(pending)} succeeded")
    return succeeded
+_cache = { + 'knowledge_stats': None, + 'pt_dashboard': None, + 'qdrant_scroll': None, + 'qdrant_scroll_ts': 0, + 'quick_stats': None, +} + +app = Flask(__name__, + template_folder=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'templates'), + static_folder=os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'static')) + +# ── Navigation Constants ── + +KNOWLEDGE_SUBNAV = [ + {'href': '/', 'label': 'Dashboard'}, + {'href': '/catalogue', 'label': 'Catalogue'}, + {'href': '/upload', 'label': 'Upload'}, + {'href': '/web-ingest', 'label': 'Web Ingest'}, + {'href': '/failures', 'label': 'Failures'}, +] + +PEERTUBE_SUBNAV = [ + {'href': '/peertube', 'label': 'Dashboard'}, + {'href': '/peertube/channels', 'label': 'Channels'}, +] + +SETTINGS_SUBNAV = [ + {'href': '/settings/keys', 'label': 'API Keys'}, + {'href': '/settings/cookies', 'label': 'YouTube Cookies'}, + {'href': '/settings/vpn', 'label': 'NordVPN'}, + {'href': '/settings/health', 'label': 'Service Health'}, +] + + +def _format_source_citation(payload): + """Format a human-readable citation from a search result payload.""" + book = payload.get('book_title', '') + if not book: + book = clean_filename_to_title(payload.get('filename', 'Unknown')) + page = payload.get('page_ref', '') + if page: + page_str = str(page) + if not page_str.startswith('p'): + page_str = f"p. 
{page_str}" + return f"{book}, {page_str}" + return book + + +def _resolve_upload_path(category, config): + """Resolve the target directory for an upload given a category name.""" + upload_paths = config.get('upload_paths', {}) + library_root = config['library_root'] + + if category in upload_paths: + return upload_paths[category] + + default_path = upload_paths.get('default', library_root) + safe_category = secure_filename(category) if category else '' + if safe_category: + return os.path.join(default_path, safe_category) + return default_path + + +def _process_upload(filepath, original_filename, category, config, db): + """Process a single PDF upload: hash, dedup, copy to library, catalogue, queue.""" + library_root = config['library_root'] + + file_hash = content_hash(filepath) + + conn = db._get_conn() + existing = conn.execute("SELECT * FROM catalogue WHERE hash = ?", (file_hash,)).fetchone() + if existing: + raise ValueError(f"Duplicate: file already catalogued as {existing['filename']}") + + target_dir = _resolve_upload_path(category, config) + os.makedirs(target_dir, exist_ok=True) + + safe_name = secure_filename(original_filename) + if not safe_name: + safe_name = f"{file_hash}.pdf" + target_path = os.path.join(target_dir, safe_name) + + if os.path.exists(target_path): + base, ext = os.path.splitext(safe_name) + target_path = os.path.join(target_dir, f"{base}_{file_hash[:8]}{ext}") + + shutil.copy2(filepath, target_path) + size = os.path.getsize(target_path) + + source, derived_category = derive_source_and_category(target_path, library_root) + + db.add_to_catalogue(file_hash, safe_name, target_path, size, source, derived_category) + db.queue_document(file_hash) + + return { + 'hash': file_hash, + 'filename': safe_name, + 'category': derived_category, + 'source': source, + 'path': target_path, + 'size_bytes': size, + 'status': 'queued' + } + + +# ── Page Routes ── + +@app.route('/') +def dashboard(): + return render_template('knowledge/dashboard.html', + 
@app.route('/search')
def search_page():
    """Render the semantic search page.

    Query parameters:
        q: Search text. When empty, the bare search form is rendered.
        limit: Maximum number of hits (default 20). A value that cannot be
            parsed as an integer falls back to the default instead of
            raising.
        source_type: Optional exact-match filter on the ``source_type``
            payload field.

    The query is embedded via ``get_embedding_single`` and looked up in the
    configured Qdrant collection. Any failure while embedding or searching is
    logged and shown on the page through the template's ``error`` variable.
    """
    query = request.args.get('q', '')
    if not query:
        return render_template('search.html', domain='search', subnav=None, active_page='/search')

    config = get_config()
    # type=int makes the lookup return the default when the cast fails, so a
    # malformed ?limit= no longer escapes as an unhandled ValueError.
    limit = request.args.get('limit', 20, type=int)
    source_filter = request.args.get('source_type', None)

    try:
        from .embedder import get_embedding_single
        query_vector = get_embedding_single(query, config)

        qdrant = QdrantClient(
            host=config['vector_db']['host'],
            port=config['vector_db']['port'],
            timeout=60
        )

        search_filter = None
        if source_filter:
            search_filter = Filter(must=[
                FieldCondition(key="source_type", match=MatchValue(value=source_filter))
            ])

        results = qdrant.query_points(
            collection_name=config['vector_db']['collection'],
            query=query_vector,
            limit=limit,
            query_filter=search_filter
        ).points

        formatted = []
        for r in results:
            # Tolerate points that carry no payload at all.
            p = r.payload or {}

            # 'domain' is stored either as a single string or as a list.
            raw_dom = p.get('domain', [])
            if isinstance(raw_dom, str):
                domains = [raw_dom] if raw_dom else []
            elif isinstance(raw_dom, list):
                domains = raw_dom
            else:
                domains = []

            formatted.append({
                'score': r.score,
                'title': p.get('title', 'Untitled'),
                # Fall back to a content excerpt; guard against content=None.
                'summary': p.get('summary', (p.get('content') or '')[:200]),
                'citation': _format_source_citation(p),
                'download_url': p.get('download_url', ''),
                'source_type': p.get('source_type', 'document'),
                'knowledge_type': p.get('knowledge_type', ''),
                'complexity': p.get('complexity', ''),
                'domains': domains,
            })

        return render_template('search.html', domain='search', subnav=None, active_page='/search',
                               query=query, results=formatted)

    except Exception as e:
        # Page-level boundary: keep the UI responsive, but leave a traceback
        # in the server log so the failure can be diagnosed.
        logger.exception("Search failed for query %r", query)
        return render_template('search.html', domain='search', subnav=None, active_page='/search',
                               query=query, error=str(e))
per_page = int(request.args.get('per_page', 50)) + page = int(request.args.get('page', 1)) + if page < 1: + page = 1 + + offset = (page - 1) * per_page + total_count = db.count_documents(source=source, category=category) + total_pages = max(1, (total_count + per_page - 1) // per_page) + if page > total_pages: + page = total_pages + offset = (page - 1) * per_page + + docs = db.get_all_documents(source=source, category=category, limit=per_page, offset=offset) + sources = db.get_sources() + + return render_template('knowledge/catalogue.html', + domain='knowledge', subnav=KNOWLEDGE_SUBNAV, active_page='/catalogue', + docs=docs, sources=sources, current_source=source, + page=page, per_page=per_page, total_pages=total_pages, total_count=total_count) + + +@app.route('/upload') +def upload_page(): + db = StatusDB() + config = get_config() + + upload_paths = config.get('upload_paths', {}) + categories = sorted(k for k in upload_paths if k != 'default') + db_sources = db.get_sources() + for s in db_sources: + if s not in categories: + categories.append(s) + + options_html = ''.join(f'