# RECON Configuration
# See PROJECT-BIBLE.md Section 11 for full documentation

# Root path for the PDF library (NFS mount from pi-nas)
library_root: /mnt/library
|
|
|
|
|
|
|
|
|
|
# Processing stage tuning: worker counts, batch sizes, timeouts, retry backoff.
processing:
  max_pdf_size_mb: 2000  # Raised from 200MB default for large scanned books
  extract_workers: 4  # Concurrent PDF extraction threads
  enrich_workers: 16  # Concurrent Gemini enrichment threads (4 keys x 4)
  embed_workers: 4  # Concurrent embedding threads
  enrich_window_size: 5  # Pages per enrichment window (sent to Gemini)
  embed_batch_size: 500  # Vectors per Qdrant upsert batch
  rate_limit_delay: 0.1  # Delay between Gemini API calls (seconds)
  max_retries: 5  # Max retries for failed documents
  extract_timeout: 1800  # Max seconds per document extraction (30 min, allows vision OCR)
  page_timeout: 30  # Max seconds per page extraction
  enrich_max_retries: 5  # Max retries per enrichment window
  enrich_base_delay: 5.0  # Base backoff delay (seconds) — ~5s, 10s, 20s, 40s, 80s
  enrich_max_delay: 120.0  # Maximum backoff delay cap (seconds)
|
|
|
|
|
|
|
|
|
|
# Dense embedding backend selection and connection settings.
embedding:
  backend: tei  # "tei" (primary, ~1,711 emb/sec) or "ollama" (fallback, ~8 emb/sec)
  tei_host: 100.64.0.14  # TEI server (cortex)
  tei_port: 8090  # TEI HTTP port
  ollama_host: 100.64.0.14  # Ollama server (cortex) — fallback only
  ollama_port: 11434  # Ollama HTTP port
  model: bge-m3  # Embedding model name
  dimensions: 1024  # CRITICAL: bge-m3 is 1024-dim, NOT 384
  batch_size: 128  # Embeddings per TEI batch request
|
|
|
|
|
|
|
|
|
|
# Sparse embedding service endpoint.
sparse_embedding:
  enabled: true
  host: 100.64.0.14  # Sparse embedding service (cortex)
  port: 8091  # Sparse embedding HTTP port
|
|
|
|
|
|
|
|
|
|
# Qdrant vector database connection.
vector_db:
  host: 100.64.0.14  # Qdrant server (cortex)
  port: 6333  # Qdrant HTTP port
  collection: recon_knowledge_hybrid  # Collection name
|
|
|
|
|
|
|
|
|
|
# Gemini enrichment model settings.
gemini:
  model: gemini-2.0-flash  # Gemini model for enrichment
  response_mime_type: application/json  # Force JSON output from Gemini
|
|
|
|
|
|
|
|
|
|
# Dashboard HTTP listener.
web:
  port: 8420  # Dashboard HTTP port
  host: 0.0.0.0  # Bind address (all interfaces)
|
|
|
|
|
|
|
|
|
|
# Filesystem locations used by the application.
paths:
  base: /opt/recon  # Application root
  data: /opt/recon/data  # Data directory
  text: /opt/recon/data/text  # Extracted text output (data/text/{hash}/page_NNNN.txt)
  concepts: /opt/recon/data/concepts  # Enriched concept JSONs (data/concepts/{hash}/window_N.json)
  intel: /opt/recon/data/intel  # ARGUS intel feeds
  logs: /opt/recon/logs  # Log files
  db: /opt/recon/data/recon.db  # SQLite database (WAL mode)
|
|
|
|
|
|
|
|
|
|
# Download-URL generation for library PDFs.
book_server:
  base_url: https://files.echo6.co  # Public URL prefix for PDF downloads
  strip_prefix: /mnt/library  # Path prefix stripped when generating download URLs
|
|
|
|
|
|
|
|
|
|
upload_paths:  # Category -> filesystem path mapping for uploads
  Survival Reference: /mnt/library/Survival-Companion-Library/Uploads
  Military Doctrine: /mnt/library/Army_Pubs/Uploads
  Gaming: /mnt/library/Gaming
  Reference: /mnt/library/Reference
  Technical: /mnt/library/Technical
  default: /mnt/library  # Fallback for unknown categories
|
|
|
|
|
|
|
|
|
|
# Web content ingest: chunking, HTTP timeout, politeness delay, batch limit.
web_scraper:
  words_per_page: 2000  # Target words per page chunk for web content
  fetch_timeout: 30  # HTTP request timeout (seconds)
  rate_limit_delay: 1.0  # Delay between URL fetches (seconds)
  max_batch_size: 50  # Max URLs per batch ingest
  user_agent: "Mozilla/5.0 (compatible; RECON/1.0)"
|
|
|
|
|
|
|
|
|
|
# Site crawler defaults, global URL exclusions, and the crawl target list.
crawler:
  user_agent: "Mozilla/5.0 (compatible; RECON/1.0)"
  fetch_timeout: 30  # HTTP request timeout (seconds)
  rate_limit_delay: 1.0  # Delay between page fetches (seconds)
  max_pages: 500  # Max pages to discover per crawl
  max_depth: 3  # Max link-following depth (BFS only, not sitemap)
  inter_site_cooldown: 30  # Seconds to wait between crawling different sites
  recrawl_interval_days: 7  # Skip sites crawled within this many days

  default_exclude:  # URL patterns always excluded from crawling
    - /search
    - /404
    - /login
    - /signup
    - /auth/
    - /api/
    - /assets/
    - /static/
    - /cart
    - /checkout
    - /account
    - /register
    - /subscribe
    - /membership
    - /shop
    - /store
    - /product
    - /wp-admin
    - /feed
    - /wp-json
    - /xmlrpc
    - /.well-known
    - /cdn-cgi

  # ─── Crawl Targets ─────────────────────────────────────────────
  # Sites are crawled by the scheduler loop in tier order (1 first).
  # Per-site delay overrides global rate_limit_delay for that site.
  # Per-site max_pages/max_depth override global defaults.

  # NOTE(review): `sites` is placed under `crawler` because the comments above
  # refer to this section's global defaults; the flattened source does not show
  # the original nesting — confirm against the config loader.
  # Disabled 2026-04-14 for refactor — see refactored-recon repo for context
  sites: []
|
|
|
|
|
|
|
|
|
|
# sites:
|
|
|
|
|
#
|
|
|
|
|
# # ═══ TIER 1 — Free, authoritative, high-density ═══
|
|
|
|
|
#
|
|
|
|
|
# - url: https://hesperian.org/all-hesperian-health-guides
|
|
|
|
|
# category: Medical
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "Free health guides — WTIND, midwives, community health"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://swsbm.com
|
|
|
|
|
# category: Medical
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "Michael Moore's entire free clinical herbal library — PDFs"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://swsbm.henriettesherbal.com
|
|
|
|
|
# category: Medical
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "Mirror of Moore's library — grab both"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://nchfp.uga.edu
|
|
|
|
|
# category: Sustainment Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 2.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "USDA canning/preservation safety authority"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://extension.uidaho.edu
|
|
|
|
|
# category: Foundational Skills
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 2.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "Idaho-specific — soil, water, crops, livestock"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://extension.usu.edu
|
|
|
|
|
# category: Foundational Skills
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 2.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "Utah State — Idaho-adjacent climate"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://attra.ncat.org
|
|
|
|
|
# category: Sustainment Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "ATTRA sustainable ag — hundreds of free publications"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://pfaf.org
|
|
|
|
|
# category: Sustainment Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "Plants For A Future — 7,000+ edible/medicinal plant profiles"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://eattheweeds.com
|
|
|
|
|
# category: Sustainment Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "Green Deane — 1,000+ foraging plant articles"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://lowtechmagazine.com
|
|
|
|
|
# category: Off-Grid Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "Exceptional low-tech systems analysis"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://appropedia.org
|
|
|
|
|
# category: Off-Grid Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "Appropriate technology wiki"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://journeytoforever.org
|
|
|
|
|
# category: Off-Grid Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "VITA manuals, biodiesel, biogas, hand tools archive"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://cd3wd.com
|
|
|
|
|
# category: Off-Grid Systems
|
|
|
|
|
# max_depth: 2
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "1,050+ appropriate technology eBooks — index pages only"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://practicalselfreliance.com
|
|
|
|
|
# category: Sustainment Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "Ashley Adamant — foraging, preservation, homesteading"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://open.oregonstate.edu/permaculture
|
|
|
|
|
# category: Off-Grid Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "Millison's free permaculture textbook"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://open.oregonstate.edu/permaculturedesign
|
|
|
|
|
# category: Off-Grid Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "Millison's advanced permaculture textbook"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://mushroomexpert.com
|
|
|
|
|
# category: Sustainment Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 1
|
|
|
|
|
# notes: "Michael Kuo — mushroom ID, taxonomy, regional coverage"
|
|
|
|
|
#
|
|
|
|
|
# # ═══ TIER 2 — High value, second pass ═══
|
|
|
|
|
#
|
|
|
|
|
# - url: https://motherearthnews.com
|
|
|
|
|
# category: Foundational Skills
|
|
|
|
|
# max_depth: 2
|
|
|
|
|
# max_pages: 200
|
|
|
|
|
# delay: 8.0
|
|
|
|
|
# tier: 2
|
|
|
|
|
# notes: "50 years of homesteading archive — large commercial site, be polite"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://permacultureresearchinstitute.com
|
|
|
|
|
# category: Off-Grid Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 2
|
|
|
|
|
# notes: "Geoff Lawton — articles, case studies"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://learnyourland.com
|
|
|
|
|
# category: Sustainment Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 2
|
|
|
|
|
# notes: "Adam Haritan — foraging articles"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://herbswithRosalee.com
|
|
|
|
|
# category: Medical
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 2
|
|
|
|
|
# notes: "Rosalee de la Foret — clinical herbalism articles"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://commonwealthherbs.com
|
|
|
|
|
# category: Medical
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 2
|
|
|
|
|
# notes: "Katja and Ryn — clinical herbalism"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://soilfoodweb.com
|
|
|
|
|
# category: Off-Grid Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 2
|
|
|
|
|
# notes: "Elaine Ingham soil biology — archive before it goes dark"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://rocketstoves.com
|
|
|
|
|
# category: Off-Grid Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 2
|
|
|
|
|
# notes: "Ianto Evans — rocket mass heater designs and PDFs"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://farmsteadmeatsmith.com
|
|
|
|
|
# category: Sustainment Systems
|
|
|
|
|
# max_depth: 2
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 2
|
|
|
|
|
# notes: "Brandon Sheard — butchering articles (free content only)"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://deeranddeerhunting.com
|
|
|
|
|
# category: Sustainment Systems
|
|
|
|
|
# max_depth: 2
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 2
|
|
|
|
|
# notes: "Field dressing, processing, hunting technique library"
|
|
|
|
|
#
|
|
|
|
|
# # ═══ TIER 3 — Government (authoritative) ═══
|
|
|
|
|
#
|
|
|
|
|
# - url: https://plants.usda.gov
|
|
|
|
|
# category: Sustainment Systems
|
|
|
|
|
# max_depth: 2
|
|
|
|
|
# delay: 2.0
|
|
|
|
|
# tier: 3
|
|
|
|
|
# notes: "USDA native plant database"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://ars.usda.gov
|
|
|
|
|
# category: Sustainment Systems
|
|
|
|
|
# max_depth: 2
|
|
|
|
|
# delay: 2.0
|
|
|
|
|
# tier: 3
|
|
|
|
|
# notes: "USDA Agricultural Research publications"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://nrcs.usda.gov
|
|
|
|
|
# category: Off-Grid Systems
|
|
|
|
|
# max_depth: 2
|
|
|
|
|
# delay: 2.0
|
|
|
|
|
# tier: 3
|
|
|
|
|
# notes: "Soil surveys, conservation practice standards"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://ready.gov
|
|
|
|
|
# category: Scenario Playbooks
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 2.0
|
|
|
|
|
# tier: 3
|
|
|
|
|
# notes: "FEMA emergency preparedness guides"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://emergency.cdc.gov
|
|
|
|
|
# category: Medical
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 2.0
|
|
|
|
|
# tier: 3
|
|
|
|
|
# notes: "Public health emergency references"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://agri.idaho.gov
|
|
|
|
|
# category: Foundational Skills
|
|
|
|
|
# max_depth: 2
|
|
|
|
|
# delay: 2.0
|
|
|
|
|
# tier: 3
|
|
|
|
|
# notes: "Idaho Dept of Agriculture — local relevance"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://driveonwood.com
|
|
|
|
|
# category: Off-Grid Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 3.0
|
|
|
|
|
# tier: 3
|
|
|
|
|
# notes: "Wood gasification — FEMA manual + modern improvements"
|
|
|
|
|
#
|
|
|
|
|
# # ═══ TIER 4 — Selective scrape (specific sections only) ═══
|
|
|
|
|
#
|
|
|
|
|
# - url: https://richsoil.com
|
|
|
|
|
# category: Off-Grid Systems
|
|
|
|
|
# max_depth: 2
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 4
|
|
|
|
|
# notes: "Paul Wheaton — rocket mass heaters, natural building"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://wildfoodgirl.com
|
|
|
|
|
# category: Sustainment Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 4
|
|
|
|
|
# notes: "Colorado foraging — Mountain West species"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://foragersharvest.com
|
|
|
|
|
# category: Sustainment Systems
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 4
|
|
|
|
|
# notes: "Sam Thayer's site — articles"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://mountainroseherbs.com/blog
|
|
|
|
|
# category: Medical
|
|
|
|
|
# max_depth: 2
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 4
|
|
|
|
|
# notes: "Herb profiles and preparations — blog section only"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://herbalprepper.com
|
|
|
|
|
# category: Medical
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 4
|
|
|
|
|
# notes: "Cat Ellis — grid-down herbalism"
|
|
|
|
|
#
|
|
|
|
|
# - url: https://prolongedfieldcare.org
|
|
|
|
|
# category: Medical
|
|
|
|
|
# max_depth: 3
|
|
|
|
|
# delay: 5.0
|
|
|
|
|
# tier: 4
|
|
|
|
|
# notes: "PFC Collective — austere medical protocols"
|
|
|
|
|
#
|
|
|
|
|
# Service loop timing.
service:
  scan_interval: 3600  # Seconds between library scans (1 hour)
  stage_poll_interval: 30  # Seconds stages sleep when idle
  progress_interval: 60  # Seconds between progress log lines
|
|
|
|
|
|
|
|
|
|
# PeerTube ingestion: API endpoints, timeouts, and polling cadence.
peertube:
  api_base: http://192.168.1.170  # Internal PeerTube API (CT 110 nginx)
  public_url: https://stream.echo6.co  # Public URL for video links
  fetch_timeout: 30  # HTTP timeout for API/VTT requests
  rate_limit_delay: 0.5  # Delay between video ingestions (seconds)
  poll_interval: 1800  # Seconds between PeerTube acquisition polls (30 min)
|
2026-04-14 14:57:23 +00:00
|
|
|
|
2026-04-18 18:26:43 +00:00
|
|
|
# Site-mirroring scraper (wget mirrors + ZIM builds) and its crawl-mode helpers.
# NOTE(review): the flattened source does not show nesting; `preflight` and
# `singlefile` are placed under `scraper`, and `crawl_max_depth` under
# `singlefile`, based on ordering and comments — confirm against the loader.
scraper:
  workspace: /opt/recon/data/scraper  # Working directory for wget mirrors + ZIM builds
  output_dir: /mnt/kiwix  # Finished .zim files land here (kiwix-serve library)
  rate_limit_delay: 0.5  # Seconds between wget requests (--wait)
  wait_random: 1.0  # Random jitter added to wait (--random-wait range)
  default_language: eng  # ISO 639-3 language code for ZIM metadata
  user_agent: "Mozilla/5.0 (compatible; RECON/1.0; +https://echo6.co)"
  poll_interval: 300  # Seconds between checking for pending scrape jobs
  keep_workspace_on_failure: true  # Retain workspace for debugging when a job fails

  # Default URL patterns rejected by wget --reject-regex.
  # Covers common CMS junk across WordPress, Squarespace, Wix, Ghost, Drupal, etc.
  # Per-job overrides: additional_reject_patterns (appended) or skip_default_patterns (bypass).
  default_reject_patterns:
    # WordPress
    - '\?share='
    - '\?replytocom='
    - '\?like_comment='
    - '/feed/'
    - '/wp-json/'
    - '/wp-login'
    - '/wp-admin'
    - '/wp-cron'
    - '\?attachment_id='
    - '/xmlrpc'
    - '/trackback'
    - '/comment-page-'
    - '\?doing_wp_cron'
    # Squarespace
    - '\?format=json'
    - '\?format=rss'
    - '/api/'
    # Wix
    - '/_api/'
    - '/_partials/'
    # Ghost
    - '/ghost/'
    - '/p/'
    # Drupal
    - '\?q=comment'
    - '\?q=node'
    - '/user/login'
    - '/user/register'
    # General CMS / site chrome
    - '/login'
    - '/signup'
    - '/register'
    - '/cart'
    - '/checkout'
    - '/search\?'
    - '/tag/'
    - '/author/'
    - '\?print='
    - '\?pdf='
    - '\?format=amp'
    - '\?preview='
    - '/rss'
    - '/atom'
    - '/cdn-cgi/'

  # Pre-flight mode detection
  preflight:
    enabled: true
    timeout: 30  # Seconds for single-page Playwright fetch
    min_static_size: 5120  # 5KB - wget HTML below this = suspect JS site
    min_browser_size: 20480  # 20KB - browser HTML above this confirms JS
    spa_markers:
      - 'div#root'
      - 'div#app'
      - 'div#__next'

  # SingleFile CLI settings (browser crawl mode)
  singlefile:
    executable: single-file
    chromium_path: "/usr/bin/chromium-browser"
    crawl_max_depth: 10
|
|
|
|
|
|
2026-04-14 14:57:23 +00:00
|
|
|
# Stream B: New Library Pipeline
new_pipeline:
  # Disabled 2026-04-14 for refactor — see refactored-recon repo for context
  enabled: false
  acquired_dir: /mnt/library/_acquired
  ingest_dir: /mnt/library/_ingest
  duplicates_dir: /mnt/library/_ingest/_duplicates
  failed_dir: /mnt/library/_ingest/_failed
  poll_interval: 60  # NOTE(review): unit not stated here; presumably seconds — confirm
  mtime_stability: 10  # NOTE(review): unit not stated here; presumably seconds — confirm
  pilot_domain: "Civil Organization"
  spaces_to_underscores: true
|
|
|
|
|
|
|
|
|
|
# Refactored pipeline configuration (2026-04-14)
# See https://forge.echo6.co/matt/refactored-recon for design
# NOTE(review): the flattened source does not show nesting; `language_filter`
# and `allowed_languages` are placed under `pipeline` based on ordering —
# confirm against the config loader.
pipeline:
  acquired_root: /opt/recon/data/acquired
  processing_root: /opt/recon/data/processing
  # Subfolder name -> processor module mapping
  # Processors do not exist yet; this is scaffolding for Phase 3+
  dispatch:
    pdf: pdf_processor
    stream: transcript_processor
    html: html_processor
    text: text_processor
  # mtime stability threshold for picking up files from acquired/
  mtime_stability_seconds: 10
  # Language filter: skip non-English content before Gemini enrichment
  language_filter: true  # Enable langdetect-based filtering
  allowed_languages:  # ISO 639-1 codes allowed through enrichment
    - en
|