mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 06:34:40 +02:00
Phase 6c: remove vestigial extract worker, dead crawler, .bak files
recon.py:
- Remove extract stage_loop thread from cmd_service(). Confirmed vestigial: 0 queued items, silent logs over a 24+ hour run. The new processors do extraction inline in pre_flight().
- Remove cmd_crawl CLI subcommand and its argparse registration.
- Clean up associated imports and variables.

Deleted:
- lib/crawler.py (432 lines) -- old web crawler subsystem, only referenced by the removed CLI subcommand.
- 24 .bak files (untracked pre-edit safety backups, originals preserved in git history).

Investigation found the four old loop function definitions (scanner_loop, peertube_scanner_loop, crawler_scheduler_loop, organizer_loop) were already deleted in Phase 5c-1.

Modules investigated and KEPT:
- lib/web_scraper.py -- exports chunk_text() used by transcript_processor
- lib/new_pipeline.py -- active Stream B library management CLI tool
- lib/peertube_scraper.py -- only mechanism for transcript ingestion
- lib/extractor.py -- would activate for new PDFs via cmd_run CLI

Service restart verified: 6 threads (dispatcher, enrich, embed, filing, progress, dashboard), no extract worker, zero errors.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
70b80cb312
commit
efae4023f6
2 changed files with 3 additions and 521 deletions
92
recon.py
92
recon.py
|
|
@@ -3,7 +3,7 @@
|
|||
RECON CLI — Main entry point.
|
||||
|
||||
Subcommands: scan, queue, extract, enrich, embed, run, search, upload,
|
||||
ingest-url, crawl, ingest-peertube, organize, status, catalogue, failures, validate, rebuild, serve, ingest.
|
||||
ingest-url, ingest-peertube, organize, status, catalogue, failures, validate, rebuild, serve, ingest.
|
||||
|
||||
Usage: cd /opt/recon && source venv/bin/activate && python3 recon.py <command>
|
||||
"""
|
||||
|
|
@@ -580,73 +580,6 @@ def cmd_ingest_url(args):
|
|||
|
||||
|
||||
|
||||
def cmd_crawl(args):
    """Crawl a site with ``lib.crawler.crawl_site`` and print a report.

    In dry-run mode the discovered URLs are listed and nothing else happens.
    Otherwise a result summary (and up to ten failed URLs) is printed, then
    enrichment and/or embedding are run when requested.

    Args:
        args: Parsed CLI namespace. Reads ``url``, ``category``, ``source``,
            ``include``, ``exclude``, ``max_pages``, ``max_depth``, ``delay``,
            ``dry_run``, ``no_sitemap``, ``enrich`` and ``process``.

    Returns:
        int: Always 0.
    """
    # Function-scope import: the crawler is only needed by this subcommand.
    from lib.crawler import crawl_site

    print(f"Crawling {args.url}...")
    if args.include:
        print(f" Include paths: {args.include}")
    if args.exclude:
        print(f" Exclude paths: {args.exclude}")
    if args.dry_run:
        print(" DRY RUN — no content will be ingested")
    print()

    result = crawl_site(
        base_url=args.url,
        category=args.category,
        source=args.source,
        include=args.include,
        exclude=args.exclude,
        max_pages=args.max_pages,
        max_depth=args.max_depth,
        delay=args.delay,
        dry_run=args.dry_run,
        # The CLI flag is negative (--no-sitemap); crawl_site takes the positive.
        use_sitemap=not args.no_sitemap,
    )

    method = result.get('discovery_method', 'none')
    print(f"Discovery method: {method}")

    # Dry run: list what would be ingested and stop before any reporting
    # or post-processing.
    if args.dry_run:
        urls = result.get('urls', [])
        print(f"Found {len(urls)} URLs that would be ingested:\n")
        for i, url in enumerate(urls, 1):
            print(f" {i:4d}. {url}")
        print(f"\nTotal: {len(urls)} pages")
        print("Re-run without --dry-run to ingest.")
        return 0

    summary = result.get('summary', {})
    print("\nResults:")
    print(f" New: {summary.get('succeeded', 0)}")
    print(f" Duplicates: {summary.get('duplicates', 0)}")
    print(f" Failed: {summary.get('failed', 0)}")
    print(f" Total: {summary.get('total', 0)}")

    failed_results = [r for r in result.get('results', []) if r.get('status') == 'failed']
    if failed_results:
        print("\nFailed URLs:")
        # Cap the listing at ten entries; the remainder is reported as a count.
        for r in failed_results[:10]:
            print(f" {r['url']}: {r.get('error', 'Unknown error')}")
        if len(failed_results) > 10:
            print(f" ... and {len(failed_results) - 10} more")

    # --process implies enrichment followed by embedding; --enrich alone
    # stops after enrichment.
    if args.enrich or args.process:
        print("\nRunning enrichment...")
        from lib.enricher import run_enrichment
        enriched = run_enrichment()
        print(f" Enriched: {enriched}")

    if args.process:
        print("\nRunning embedding...")
        from lib.embedder import run_embedding
        embedded = run_embedding()
        print(f" Embedded: {embedded}")

    return 0
|
||||
|
||||
|
||||
def cmd_validate(args):
|
||||
from scripts.validate import run_validation
|
||||
run_validation(deep=args.deep)
|
||||
|
|
@@ -671,7 +604,6 @@ def cmd_service(args):
|
|||
Bundles: Flask dashboard + dispatcher + pipeline stages + filing worker + progress reporter.
|
||||
All threads are daemon threads; SIGTERM/SIGINT trigger graceful shutdown.
|
||||
"""
|
||||
from lib.extractor import run_extraction
|
||||
from lib.enricher import run_enrichment
|
||||
from lib.embedder import run_embedding
|
||||
from lib.api import app, run_server as start_dashboard
|
||||
|
|
@@ -682,7 +614,6 @@ def cmd_service(args):
|
|||
proc = config.get('processing', {})
|
||||
svc = config.get('service', {})
|
||||
|
||||
extract_workers = proc.get('extract_workers', 4)
|
||||
enrich_workers = proc.get('enrich_workers', 16)
|
||||
embed_workers = proc.get('embed_workers', 4)
|
||||
poll_interval = svc.get('stage_poll_interval', 30)
|
||||
|
|
@@ -693,7 +624,7 @@ def cmd_service(args):
|
|||
web_port = config.get('web', {}).get('port', 8420)
|
||||
|
||||
stop_event = threading.Event()
|
||||
totals = {'extract': 0, 'enrich': 0, 'embed': 0}
|
||||
totals = {'enrich': 0, 'embed': 0}
|
||||
|
||||
def shutdown(signum, frame):
|
||||
sig_name = signal.Signals(signum).name
|
||||
|
|
@@ -746,8 +677,6 @@ def cmd_service(args):
|
|||
threads = [
|
||||
threading.Thread(target=lambda: dispatch_loop(stop_event, db, config, interval=dispatch_interval),
|
||||
daemon=True, name='dispatcher'),
|
||||
threading.Thread(target=stage_loop, daemon=True, name='extract',
|
||||
args=('extract', lambda: run_extraction(workers=extract_workers))),
|
||||
threading.Thread(target=stage_loop, daemon=True, name='enrich',
|
||||
args=('enrich', lambda: run_enrichment(workers=enrich_workers))),
|
||||
threading.Thread(target=stage_loop, daemon=True, name='embed',
|
||||
|
|
@@ -761,7 +690,7 @@ def cmd_service(args):
|
|||
|
||||
logger.info("=== RECON Service Starting ===")
|
||||
logger.info(f" Dashboard: {web_host}:{web_port}")
|
||||
logger.info(f" Workers: extract={extract_workers}, enrich={enrich_workers}, embed={embed_workers}")
|
||||
logger.info(f" Workers: enrich={enrich_workers}, embed={embed_workers}")
|
||||
logger.info(f" Dispatcher: every {dispatch_interval}s | Filing: every {filing_interval}s")
|
||||
logger.info(f" Progress: every {progress_interval}s")
|
||||
|
||||
|
|
@@ -1188,21 +1117,6 @@ def main():
|
|||
p.set_defaults(func=cmd_ingest_url)
|
||||
|
||||
# crawl
|
||||
p = sub.add_parser('crawl', help='Crawl a site and ingest discovered pages')
|
||||
p.add_argument('url', help='Base URL to crawl')
|
||||
p.add_argument('--category', default='Web', help='Category for ingested content')
|
||||
p.add_argument('--source', default=None, help='Source identifier (default: domain name)')
|
||||
p.add_argument('--include', nargs='+', help='Only include URL paths starting with these prefixes')
|
||||
p.add_argument('--exclude', nargs='+', help='Exclude URL paths starting with these prefixes')
|
||||
p.add_argument('--max-pages', type=int, default=500, help='Maximum pages to ingest')
|
||||
p.add_argument('--max-depth', type=int, default=3, help='Maximum link-follow depth')
|
||||
p.add_argument('--delay', type=float, default=1.0, help='Delay between page fetches (seconds)')
|
||||
p.add_argument('--dry-run', action='store_true', help='Discover URLs without ingesting')
|
||||
p.add_argument('--no-sitemap', action='store_true', help='Skip sitemap, use link-following only')
|
||||
p.add_argument('--enrich', action='store_true', help='Run enrichment after crawl')
|
||||
p.add_argument('--process', action='store_true', help='Full pipeline: crawl + enrich + embed')
|
||||
p.set_defaults(func=cmd_crawl)
|
||||
|
||||
# validate
|
||||
p = sub.add_parser('validate', help='Validate pipeline consistency')
|
||||
p.add_argument('--deep', action='store_true', help='Deep validation (check all files)')
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue