# recon/config.yaml

# RECON Configuration
# See PROJECT-BIBLE.md Section 11 for full documentation
# Root path for the PDF library (NFS mount from pi-nas).
# book_server.strip_prefix and the book_server upload_paths entries use this
# same /mnt/library prefix; review them together if the mount point changes.
library_root: /mnt/library
# Worker counts, batch sizes, timeouts, and retry/backoff tuning for the
# extract -> enrich -> embed stages.
processing:
  max_pdf_size_mb: 2000  # Raised from 200MB default for large scanned books
  extract_workers: 4  # Concurrent PDF extraction threads
  enrich_workers: 16  # Concurrent Gemini enrichment threads (4 keys x 4)
  embed_workers: 4  # Concurrent embedding threads
  enrich_window_size: 5  # Pages per enrichment window (sent to Gemini)
  embed_batch_size: 500  # Vectors per Qdrant upsert batch
  rate_limit_delay: 0.1  # Delay between Gemini API calls (seconds)
  max_retries: 5  # Max retries for failed documents
  extract_timeout: 1800  # Max seconds per document extraction (30 min, allows vision OCR)
  page_timeout: 30  # Max seconds per page extraction
  enrich_max_retries: 5  # Max retries per enrichment window
  enrich_base_delay: 5.0  # Base backoff delay (seconds) — ~5s, 10s, 20s, 40s, 80s
  enrich_max_delay: 120.0  # Maximum backoff delay cap (seconds)
# Dense embedding backend selection and endpoints.
embedding:
  backend: tei  # "tei" (primary, ~1,711 emb/sec) or "ollama" (fallback, ~8 emb/sec)
  tei_host: 100.64.0.14  # TEI server (cortex)
  tei_port: 8090  # TEI HTTP port
  ollama_host: 100.64.0.14  # Ollama server (cortex) — fallback only
  ollama_port: 11434  # Ollama HTTP port
  model: bge-m3  # Embedding model name
  dimensions: 1024  # CRITICAL: bge-m3 is 1024-dim, NOT 384
  batch_size: 128  # Embeddings per TEI batch request
# Sparse embedding service endpoint.
sparse_embedding:
  enabled: true
  host: 100.64.0.14  # Sparse embedding service (cortex)
  port: 8091  # Sparse embedding HTTP port
# Qdrant vector database connection.
vector_db:
  host: 100.64.0.14  # Qdrant server (cortex)
  port: 6333  # Qdrant HTTP port
  collection: recon_knowledge_hybrid  # Collection name
# Gemini enrichment model settings.
gemini:
  model: gemini-2.0-flash  # Gemini model for enrichment
  response_mime_type: application/json  # Force JSON output from Gemini
# Dashboard HTTP listener.
web:
  port: 8420  # Dashboard HTTP port
  host: 0.0.0.0  # Bind address (all interfaces)
# Filesystem layout; every entry below lives under /opt/recon.
paths:
  base: /opt/recon  # Application root
  data: /opt/recon/data  # Data directory
  text: /opt/recon/data/text  # Extracted text output (data/text/{hash}/page_NNNN.txt)
  concepts: /opt/recon/data/concepts  # Enriched concept JSONs (data/concepts/{hash}/window_N.json)
  intel: /opt/recon/data/intel  # ARGUS intel feeds
  logs: /opt/recon/logs  # Log files
  db: /opt/recon/data/recon.db  # SQLite database (WAL mode)
# Public download URLs and upload destinations for library files.
book_server:
  base_url: https://files.echo6.co  # Public URL prefix for PDF downloads
  strip_prefix: /mnt/library  # Path prefix stripped when generating download URLs
  upload_paths:  # Category -> filesystem path mapping for uploads
    Survival Reference: /mnt/library/Survival-Companion-Library/Uploads
    Military Doctrine: /mnt/library/Army_Pubs/Uploads
    Gaming: /mnt/library/Gaming
    Reference: /mnt/library/Reference
    Technical: /mnt/library/Technical
    default: /mnt/library  # Fallback for unknown categories
# Single-URL / batch web page ingestion settings.
web_scraper:
  words_per_page: 2000  # Target words per page chunk for web content
  fetch_timeout: 30  # HTTP request timeout (seconds)
  rate_limit_delay: 1.0  # Delay between URL fetches (seconds)
  max_batch_size: 50  # Max URLs per batch ingest
  user_agent: "Mozilla/5.0 (compatible; RECON/1.0)"
# Site crawler: global defaults, always-excluded URL patterns, and crawl targets.
crawler:
  user_agent: "Mozilla/5.0 (compatible; RECON/1.0)"
  fetch_timeout: 30  # HTTP request timeout (seconds)
  rate_limit_delay: 1.0  # Delay between page fetches (seconds)
  max_pages: 500  # Max pages to discover per crawl
  max_depth: 3  # Max link-following depth (BFS only, not sitemap)
  inter_site_cooldown: 30  # Seconds to wait between crawling different sites
  recrawl_interval_days: 7  # Skip sites crawled within this many days
  default_exclude:  # URL patterns always excluded from crawling
    - /search
    - /404
    - /login
    - /signup
    - /auth/
    - /api/
    - /assets/
    - /static/
    - /cart
    - /checkout
    - /account
    - /register
    - /subscribe
    - /membership
    - /shop
    - /store
    - /product
    - /wp-admin
    - /feed
    - /wp-json
    - /xmlrpc
    - /.well-known
    - /cdn-cgi
  # ─── Crawl Targets ─────────────────────────────────────────────
  # Sites are crawled by the scheduler loop in tier order (1 first).
  # Per-site delay overrides global rate_limit_delay for that site.
  # Per-site max_pages/max_depth override global defaults.
  # Disabled 2026-04-14 for refactor — see refactored-recon repo for context
  # NOTE(review): `sites` is nested under `crawler` because the comments above
  # describe overrides of the crawler globals — confirm against the config loader.
  sites: []
# sites:
#
# # ═══ TIER 1 — Free, authoritative, high-density ═══
#
# - url: https://hesperian.org/all-hesperian-health-guides
# category: Medical
# max_depth: 3
# delay: 3.0
# tier: 1
# notes: "Free health guides — WTIND, midwives, community health"
#
# - url: https://swsbm.com
# category: Medical
# max_depth: 3
# delay: 3.0
# tier: 1
# notes: "Michael Moore's entire free clinical herbal library — PDFs"
#
# - url: https://swsbm.henriettesherbal.com
# category: Medical
# max_depth: 3
# delay: 3.0
# tier: 1
# notes: "Mirror of Moore's library — grab both"
#
# - url: https://nchfp.uga.edu
# category: Sustainment Systems
# max_depth: 3
# delay: 2.0
# tier: 1
# notes: "USDA canning/preservation safety authority"
#
# - url: https://extension.uidaho.edu
# category: Foundational Skills
# max_depth: 3
# delay: 2.0
# tier: 1
# notes: "Idaho-specific — soil, water, crops, livestock"
#
# - url: https://extension.usu.edu
# category: Foundational Skills
# max_depth: 3
# delay: 2.0
# tier: 1
# notes: "Utah State — Idaho-adjacent climate"
#
# - url: https://attra.ncat.org
# category: Sustainment Systems
# max_depth: 3
# delay: 3.0
# tier: 1
# notes: "ATTRA sustainable ag — hundreds of free publications"
#
# - url: https://pfaf.org
# category: Sustainment Systems
# max_depth: 3
# delay: 3.0
# tier: 1
# notes: "Plants For A Future — 7,000+ edible/medicinal plant profiles"
#
# - url: https://eattheweeds.com
# category: Sustainment Systems
# max_depth: 3
# delay: 3.0
# tier: 1
# notes: "Green Deane — 1,000+ foraging plant articles"
#
# - url: https://lowtechmagazine.com
# category: Off-Grid Systems
# max_depth: 3
# delay: 3.0
# tier: 1
# notes: "Exceptional low-tech systems analysis"
#
# - url: https://appropedia.org
# category: Off-Grid Systems
# max_depth: 3
# delay: 3.0
# tier: 1
# notes: "Appropriate technology wiki"
#
# - url: https://journeytoforever.org
# category: Off-Grid Systems
# max_depth: 3
# delay: 3.0
# tier: 1
# notes: "VITA manuals, biodiesel, biogas, hand tools archive"
#
# - url: https://cd3wd.com
# category: Off-Grid Systems
# max_depth: 2
# delay: 3.0
# tier: 1
# notes: "1,050+ appropriate technology eBooks — index pages only"
#
# - url: https://practicalselfreliance.com
# category: Sustainment Systems
# max_depth: 3
# delay: 3.0
# tier: 1
# notes: "Ashley Adamant — foraging, preservation, homesteading"
#
# - url: https://open.oregonstate.edu/permaculture
# category: Off-Grid Systems
# max_depth: 3
# delay: 3.0
# tier: 1
# notes: "Millison's free permaculture textbook"
#
# - url: https://open.oregonstate.edu/permaculturedesign
# category: Off-Grid Systems
# max_depth: 3
# delay: 3.0
# tier: 1
# notes: "Millison's advanced permaculture textbook"
#
# - url: https://mushroomexpert.com
# category: Sustainment Systems
# max_depth: 3
# delay: 3.0
# tier: 1
# notes: "Michael Kuo — mushroom ID, taxonomy, regional coverage"
#
# # ═══ TIER 2 — High value, second pass ═══
#
# - url: https://motherearthnews.com
# category: Foundational Skills
# max_depth: 2
# max_pages: 200
# delay: 8.0
# tier: 2
# notes: "50 years of homesteading archive — large commercial site, be polite"
#
# - url: https://permacultureresearchinstitute.com
# category: Off-Grid Systems
# max_depth: 3
# delay: 5.0
# tier: 2
# notes: "Geoff Lawton — articles, case studies"
#
# - url: https://learnyourland.com
# category: Sustainment Systems
# max_depth: 3
# delay: 5.0
# tier: 2
# notes: "Adam Haritan — foraging articles"
#
# - url: https://herbswithRosalee.com
# category: Medical
# max_depth: 3
# delay: 5.0
# tier: 2
# notes: "Rosalee de la Foret — clinical herbalism articles"
#
# - url: https://commonwealthherbs.com
# category: Medical
# max_depth: 3
# delay: 5.0
# tier: 2
# notes: "Katja and Ryn — clinical herbalism"
#
# - url: https://soilfoodweb.com
# category: Off-Grid Systems
# max_depth: 3
# delay: 5.0
# tier: 2
# notes: "Elaine Ingham soil biology — archive before it goes dark"
#
# - url: https://rocketstoves.com
# category: Off-Grid Systems
# max_depth: 3
# delay: 5.0
# tier: 2
# notes: "Ianto Evans — rocket mass heater designs and PDFs"
#
# - url: https://farmsteadmeatsmith.com
# category: Sustainment Systems
# max_depth: 2
# delay: 5.0
# tier: 2
# notes: "Brandon Sheard — butchering articles (free content only)"
#
# - url: https://deeranddeerhunting.com
# category: Sustainment Systems
# max_depth: 2
# delay: 5.0
# tier: 2
# notes: "Field dressing, processing, hunting technique library"
#
# # ═══ TIER 3 — Government (authoritative) ═══
#
# - url: https://plants.usda.gov
# category: Sustainment Systems
# max_depth: 2
# delay: 2.0
# tier: 3
# notes: "USDA native plant database"
#
# - url: https://ars.usda.gov
# category: Sustainment Systems
# max_depth: 2
# delay: 2.0
# tier: 3
# notes: "USDA Agricultural Research publications"
#
# - url: https://nrcs.usda.gov
# category: Off-Grid Systems
# max_depth: 2
# delay: 2.0
# tier: 3
# notes: "Soil surveys, conservation practice standards"
#
# - url: https://ready.gov
# category: Scenario Playbooks
# max_depth: 3
# delay: 2.0
# tier: 3
# notes: "FEMA emergency preparedness guides"
#
# - url: https://emergency.cdc.gov
# category: Medical
# max_depth: 3
# delay: 2.0
# tier: 3
# notes: "Public health emergency references"
#
# - url: https://agri.idaho.gov
# category: Foundational Skills
# max_depth: 2
# delay: 2.0
# tier: 3
# notes: "Idaho Dept of Agriculture — local relevance"
#
# - url: https://driveonwood.com
# category: Off-Grid Systems
# max_depth: 3
# delay: 3.0
# tier: 3
# notes: "Wood gasification — FEMA manual + modern improvements"
#
# # ═══ TIER 4 — Selective scrape (specific sections only) ═══
#
# - url: https://richsoil.com
# category: Off-Grid Systems
# max_depth: 2
# delay: 5.0
# tier: 4
# notes: "Paul Wheaton — rocket mass heaters, natural building"
#
# - url: https://wildfoodgirl.com
# category: Sustainment Systems
# max_depth: 3
# delay: 5.0
# tier: 4
# notes: "Colorado foraging — Mountain West species"
#
# - url: https://foragersharvest.com
# category: Sustainment Systems
# max_depth: 3
# delay: 5.0
# tier: 4
# notes: "Sam Thayer's site — articles"
#
# - url: https://mountainroseherbs.com/blog
# category: Medical
# max_depth: 2
# delay: 5.0
# tier: 4
# notes: "Herb profiles and preparations — blog section only"
#
# - url: https://herbalprepper.com
# category: Medical
# max_depth: 3
# delay: 5.0
# tier: 4
# notes: "Cat Ellis — grid-down herbalism"
#
# - url: https://prolongedfieldcare.org
# category: Medical
# max_depth: 3
# delay: 5.0
# tier: 4
# notes: "PFC Collective — austere medical protocols"
#
# Background service loop timing (all values in seconds).
service:
  scan_interval: 3600  # Seconds between library scans (1 hour)
  stage_poll_interval: 30  # Seconds stages sleep when idle
  progress_interval: 60  # Seconds between progress log lines
# PeerTube video acquisition settings.
peertube:
  api_base: http://192.168.1.170  # Internal PeerTube API (CT 110 nginx)
  public_url: https://stream.echo6.co  # Public URL for video links
  fetch_timeout: 30  # HTTP timeout for API/VTT requests
  rate_limit_delay: 0.5  # Delay between video ingestions (seconds)
  poll_interval: 1800  # Seconds between PeerTube acquisition polls (30 min)
# Site mirroring settings: wget mirrors built into ZIM files, plus the
# pre-flight check and SingleFile browser crawl mode.
scraper:
  workspace: /opt/recon/data/scraper  # Working directory for wget mirrors + ZIM builds
  output_dir: /mnt/kiwix  # Finished .zim files land here (kiwix-serve library)
  rate_limit_delay: 0.5  # Seconds between wget requests (--wait)
  wait_random: 1.0  # Random jitter added to wait (--random-wait range)
  default_language: eng  # ISO 639-3 language code for ZIM metadata
  user_agent: "Mozilla/5.0 (compatible; RECON/1.0; +https://echo6.co)"
  poll_interval: 300  # Seconds between checking for pending scrape jobs
  keep_workspace_on_failure: true  # Retain workspace for debugging when a job fails
  # Default URL patterns rejected by wget --reject-regex.
  # Covers common CMS junk across WordPress, Squarespace, Wix, Ghost, Drupal, etc.
  # Per-job overrides: additional_reject_patterns (appended) or skip_default_patterns (bypass).
  default_reject_patterns:
    # WordPress
    - '\?share='
    - '\?replytocom='
    - '\?like_comment='
    - '/feed/'
    - '/wp-json/'
    - '/wp-login'
    - '/wp-admin'
    - '/wp-cron'
    - '\?attachment_id='
    - '/xmlrpc'
    - '/trackback'
    - '/comment-page-'
    - '\?doing_wp_cron'
    # Squarespace
    - '\?format=json'
    - '\?format=rss'
    - '/api/'
    # Wix
    - '/_api/'
    - '/_partials/'
    # Ghost
    - '/ghost/'
    - '/p/'
    # Drupal
    - '\?q=comment'
    - '\?q=node'
    - '/user/login'
    - '/user/register'
    # General CMS / site chrome
    - '/login'
    - '/signup'
    - '/register'
    - '/cart'
    - '/checkout'
    - '/search\?'
    - '/tag/'
    - '/author/'
    - '\?print='
    - '\?pdf='
    - '\?format=amp'
    - '\?preview='
    - '/rss'
    - '/atom'
    - '/cdn-cgi/'
  # NOTE(review): `preflight` and `singlefile` are nested under `scraper` by
  # inference (they describe wget/browser crawl modes) — confirm against the loader.
  # Pre-flight mode detection
  preflight:
    enabled: true
    timeout: 30  # Seconds for single-page Playwright fetch
    min_static_size: 5120  # 5KB - wget HTML below this = suspect JS site
    min_browser_size: 20480  # 20KB - browser HTML above this confirms JS
    spa_markers:
      - 'div#root'
      - 'div#app'
      - 'div#__next'
  # SingleFile CLI settings (browser crawl mode)
  singlefile:
    executable: single-file
    chromium_path: ""  # Auto-detected from Playwright if empty
    crawl_max_depth: 10
    crawl_delay: 2  # Seconds between page fetches
# Stream B: New Library Pipeline
new_pipeline:
  # Disabled 2026-04-14 for refactor — see refactored-recon repo for context
  enabled: false
  acquired_dir: /mnt/library/_acquired
  ingest_dir: /mnt/library/_ingest
  duplicates_dir: /mnt/library/_ingest/_duplicates
  failed_dir: /mnt/library/_ingest/_failed
  poll_interval: 60
  mtime_stability: 10
  pilot_domain: "Civil Organization"
  spaces_to_underscores: true
# Refactored pipeline configuration (2026-04-14)
# See https://forge.echo6.co/matt/refactored-recon for design
pipeline:
  acquired_root: /opt/recon/data/acquired
  processing_root: /opt/recon/data/processing
  # Subfolder name -> processor module mapping
  # Processors do not exist yet; this is scaffolding for Phase 3+
  dispatch:
    pdf: pdf_processor
    stream: transcript_processor
    html: html_processor
    text: text_processor
  # mtime stability threshold for picking up files from acquired/
  mtime_stability_seconds: 10
# Language filter: non-English content is skipped before Gemini enrichment.
# NOTE(review): both keys are kept at top level as found; the original nesting
# could not be determined from this view — confirm against the config loader.
language_filter: true  # Enable langdetect-based filtering
allowed_languages:  # ISO 639-1 codes allowed through enrichment
  - en