recon/lib/crawler.py
Matt 563c16bb71 Initial commit: RECON codebase baseline
Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete).
Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-14 14:57:23 +00:00

432 lines
13 KiB
Python

"""
RECON Site Crawler — URL discovery for bulk web ingestion.
Two discovery strategies:
1. Sitemap-based (preferred) — parses sitemap.xml for all URLs
2. Link-following (fallback) — crawls from root URL following internal links
Discovered URLs are fed into web_scraper.ingest_url() for processing.
"""
import re
import time
from collections import deque
from urllib.parse import urlparse, urljoin, urldefrag
import requests
from lxml import etree
from .utils import get_config, setup_logging
logger = setup_logging('recon.crawler')
def _get_crawler_config(config=None):
"""Load crawler config with defaults."""
if config is None:
config = get_config()
crawler_cfg = config.get('crawler', {})
web_cfg = config.get('web_scraper', {})
return {
'user_agent': (
crawler_cfg.get('user_agent') or
web_cfg.get('user_agent') or
'Mozilla/5.0 (compatible; RECON/1.0)'
),
'fetch_timeout': crawler_cfg.get('fetch_timeout', 30),
'rate_limit_delay': crawler_cfg.get('rate_limit_delay', 1.0),
'max_pages': crawler_cfg.get('max_pages', 500),
'max_depth': crawler_cfg.get('max_depth', 3),
'default_exclude': crawler_cfg.get('default_exclude', [
'/search', '/404', '/login', '/signup', '/auth/', '/api/', '/assets/', '/static/'
]),
}
# ─── Sitemap Discovery ─────────────────────────────────────────────
def discover_sitemap_url(base_url, config=None):
    """
    Find the sitemap URL for a site.

    Checks, in order: the first non-empty ``Sitemap:`` directive in
    robots.txt, then /sitemap.xml, /sitemap_index.xml and /sitemap-0.xml
    (each probed with a HEAD request that follows redirects).

    Args:
        base_url: Any URL on the target site; only its scheme and host
            are used.
        config: Optional application config passed to
            _get_crawler_config().

    Returns:
        The sitemap URL as a string, or None when nothing was found.
    """
    cfg = _get_crawler_config(config)
    headers = {'User-Agent': cfg['user_agent']}
    timeout = cfg['fetch_timeout']
    parsed = urlparse(base_url)
    root = f"{parsed.scheme}://{parsed.netloc}"

    # Check robots.txt first
    try:
        resp = requests.get(f"{root}/robots.txt", headers=headers, timeout=timeout)
    except requests.RequestException as e:
        logger.debug(f"robots.txt fetch failed: {e}")
    else:
        if resp.status_code == 200:
            for line in resp.text.splitlines():
                # Split on the FIRST colon only, so the "https://..." value
                # keeps its own colons.
                key, sep, value = line.partition(':')
                if not sep or key.strip().lower() != 'sitemap':
                    continue
                value = value.strip()
                if not value:
                    # An empty directive must not short-circuit the
                    # fallback locations below.
                    continue
                # Absolute URLs pass through unchanged; a relative value is
                # resolved against the site root.
                sitemap_url = urljoin(f"{root}/", value)
                logger.info(f"Found sitemap in robots.txt: {sitemap_url}")
                return sitemap_url

    # Try common sitemap locations
    candidates = [
        f"{root}/sitemap.xml",
        f"{root}/sitemap_index.xml",
        f"{root}/sitemap-0.xml",
    ]
    for url in candidates:
        try:
            resp = requests.head(
                url,
                headers=headers,
                timeout=timeout,
                allow_redirects=True,
            )
        except requests.RequestException as e:
            logger.debug(f"Sitemap probe failed for {url}: {e}")
            continue
        if resp.status_code == 200:
            logger.info(f"Found sitemap at: {url}")
            return url

    logger.warning(f"No sitemap found for {base_url}")
    return None
def parse_sitemap(sitemap_url, config=None):
    """
    Parse a sitemap XML and return all page URLs.

    Handles standard sitemaps (urlset documents) and sitemap indexes
    (sitemapindex documents); sub-sitemaps are fetched recursively up to
    nesting depth 3, and each sitemap URL is fetched at most once.

    Args:
        sitemap_url: URL of the sitemap or sitemap index.
        config: Optional application config passed to
            _get_crawler_config().

    Returns:
        List of unique page URLs with fragments removed, in first-seen
        order. Fetch or XML errors are logged and yield no URLs for the
        affected sitemap.
    """
    cfg = _get_crawler_config(config)
    headers = {'User-Agent': cfg['user_agent']}
    nsmap = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    # The XML comes from the remote site, so entity expansion and network
    # access from within the parser are switched off.
    parser = etree.XMLParser(resolve_entities=False, no_network=True)
    all_urls = []
    fetched = set()  # sitemap URLs already requested (cycle guard)

    def _fetch_and_parse(url, depth=0):
        if depth > 3 or url in fetched:
            return
        fetched.add(url)
        try:
            resp = requests.get(url, headers=headers, timeout=cfg['fetch_timeout'])
            resp.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"Failed to fetch sitemap {url}: {e}")
            return
        try:
            root = etree.fromstring(resp.content, parser)
        except etree.XMLSyntaxError as e:
            logger.error(f"Invalid XML in sitemap {url}: {e}")
            return

        # Check if this is a sitemap index (namespaced first, then the
        # non-namespaced form so its entries are not mistaken for pages).
        sitemap_locs = (
            root.findall('.//ns:sitemap/ns:loc', nsmap)
            or root.findall('.//sitemap/loc')
        )
        if sitemap_locs:
            logger.info(f"Sitemap index at {url}: {len(sitemap_locs)} sub-sitemaps")
            for loc in sitemap_locs:
                if loc.text:
                    _fetch_and_parse(loc.text.strip(), depth + 1)
            return

        # Standard sitemap: extract URLs, falling back to a lookup without
        # namespace.
        url_locs = root.findall('.//ns:loc', nsmap) or root.findall('.//loc')
        all_urls.extend(loc.text.strip() for loc in url_locs if loc.text)
        logger.info(f"Parsed {len(url_locs)} URLs from {url}")

    _fetch_and_parse(sitemap_url)

    # Deduplicate preserving order (dict keys keep insertion order).
    unique = list(dict.fromkeys(urldefrag(url)[0] for url in all_urls))
    logger.info(f"Total unique URLs from sitemap: {len(unique)}")
    return unique
# ─── Link-Following Discovery (Fallback) ───────────────────────────
def crawl_links(base_url, max_depth=3, max_pages=500, config=None):
    """
    Discover URLs by following internal links (BFS).

    Fallback when no sitemap is available. Only links on the same host
    (netloc) as base_url are followed; links ending in common
    asset/binary extensions or containing listing/feed/WordPress-internal
    path segments are skipped.

    Args:
        base_url: URL to start crawling from.
        max_depth: Maximum link depth. Pages at this depth are recorded
            but not fetched for further links.
        max_pages: Stop once this many URLs have been recorded.
        config: Optional application config passed to
            _get_crawler_config().

    Returns:
        List of discovered URLs (fragments removed) in BFS order,
        starting with base_url.
    """
    from bs4 import BeautifulSoup

    cfg = _get_crawler_config(config)
    headers = {'User-Agent': cfg['user_agent']}
    delay = cfg['rate_limit_delay']
    base_domain = urlparse(base_url).netloc
    start_url = urldefrag(base_url)[0]

    discovered = []
    visited = set()
    # URLs that have ever been enqueued. BFS reaches a URL first at its
    # minimum depth, so later duplicates can be dropped immediately
    # instead of piling up in the queue.
    queued = {start_url}
    queue = deque([(start_url, 0)])
    skip_extensions = (
        '.pdf', '.png', '.jpg', '.jpeg', '.gif', '.svg',
        '.css', '.js', '.zip', '.tar', '.gz', '.mp4', '.mp3',
        '.ico', '.woff', '.woff2', '.ttf', '.eot',
    )
    skip_paths = (
        '/tag/', '/tags/', '/page/', '/feed/', '/rss/',
        '/wp-json/', '/wp-admin/', '/wp-includes/',
    )
    request_made = False

    while queue and len(discovered) < max_pages:
        url, depth = queue.popleft()
        if url in visited or depth > max_depth:
            continue
        visited.add(url)
        discovered.append(url)
        if depth >= max_depth:
            continue

        # Pause between consecutive requests regardless of how the previous
        # one ended, so error responses do not bypass the rate limit.
        if request_made and delay > 0:
            time.sleep(delay)
        request_made = True

        try:
            resp = requests.get(url, headers=headers, timeout=cfg['fetch_timeout'])
        except requests.RequestException as e:
            logger.debug(f"Link crawl fetch failed for {url}: {e}")
            continue
        if resp.status_code != 200:
            continue
        if 'text/html' not in resp.headers.get('content-type', ''):
            continue
        try:
            soup = BeautifulSoup(resp.text, 'lxml')
        except Exception as e:
            logger.debug(f"Link crawl HTML parse failed for {url}: {e}")
            continue

        for a_tag in soup.find_all('a', href=True):
            try:
                full_url = urldefrag(urljoin(url, a_tag['href']))[0]
                parsed = urlparse(full_url)
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): skip this
                # link rather than aborting the whole crawl.
                continue
            if full_url in queued:
                continue
            if parsed.netloc != base_domain:
                continue
            path = parsed.path.lower()
            if path.endswith(skip_extensions):
                continue
            if any(skip in path for skip in skip_paths):
                continue
            queued.add(full_url)
            queue.append((full_url, depth + 1))

    logger.info(f"Link crawl: {len(discovered)} URLs (visited {len(visited)}, depth {max_depth})")
    return discovered
# ─── URL Filtering ──────────────────────────────────────────────────
def filter_urls(urls, include=None, exclude=None):
    """
    Filter URLs by path prefix include/exclude rules.

    Args:
        urls: Sequence of URL strings.
        include: Path prefix or iterable of prefixes. When given, a URL is
            kept only if its path starts with at least one of them.
        exclude: Path prefix or iterable of prefixes. A URL whose path
            starts with any of them is dropped.

    Returns:
        List of the URLs that passed both rules, in input order.
    """
    def _as_prefixes(value):
        # A bare string is a single prefix, not an iterable of characters.
        if not value:
            return ()
        if isinstance(value, str):
            return (value,)
        return tuple(value)

    include_prefixes = _as_prefixes(include)
    exclude_prefixes = _as_prefixes(exclude)

    filtered = []
    for url in urls:
        # "https://host" has an empty path; treat it as the root "/" so it
        # matches the same rules as "https://host/".
        path = urlparse(url).path or '/'
        if include_prefixes and not path.startswith(include_prefixes):
            continue
        if exclude_prefixes and path.startswith(exclude_prefixes):
            continue
        filtered.append(url)

    logger.info(f"Filtered {len(urls)} -> {len(filtered)} URLs "
                f"(include={include}, exclude={exclude})")
    return filtered
# ─── Main Crawl Orchestrator ────────────────────────────────────────
def crawl_site(
    base_url,
    category='Web',
    source=None,
    include=None,
    exclude=None,
    max_pages=None,
    max_depth=None,
    delay=None,
    dry_run=False,
    use_sitemap=True,
    use_links=True,
    config=None,
):
    """
    Crawl a site and ingest all discovered pages.

    1. Discover URLs via sitemap or link-following
    2. Apply include/exclude filters and the max_pages limit
    3. Feed each URL through web_scraper.ingest_url()

    Args:
        base_url: Site URL to crawl.
        category: Category passed through to ingest_url().
        source: Source label passed to ingest_url(); defaults to the host
            of base_url.
        include: Path prefixes a URL must match to be kept (optional).
        exclude: Extra path prefixes to drop, added to the configured
            default excludes.
        max_pages: Maximum number of URLs to process; defaults to the
            crawler config value.
        max_depth: Maximum link-crawl depth; defaults to the crawler
            config value.
        delay: Seconds to sleep between ingests; defaults to the crawler
            config 'rate_limit_delay'.
        dry_run: When True, return the filtered URL list without
            ingesting anything.
        use_sitemap: Try sitemap discovery.
        use_links: Fall back to link crawling when no URLs were found.
        config: Application config; loaded with get_config() when None.

    Returns:
        dict. Three shapes are produced:
        - nothing discovered: 'site', 'discovery_method' (None),
          'urls_discovered', 'urls_after_filter', 'results', 'summary';
        - dry run: 'site', 'discovery_method', 'dry_run',
          'urls_discovered', 'urls';
        - normal run: 'site', 'domain', 'category', 'discovery_method',
          'urls_discovered', 'results', 'summary'.
        On the dry-run and normal paths 'urls_discovered' is the number of
        URLs left after filtering and the max_pages limit.
    """
    if config is None:
        config = get_config()
    cfg = _get_crawler_config(config)
    if max_pages is None:
        max_pages = cfg['max_pages']
    if max_depth is None:
        max_depth = cfg['max_depth']
    if delay is None:
        delay = cfg['rate_limit_delay']
    if source is None:
        source = urlparse(base_url).netloc
    logger.info(f"Crawling {base_url} (category={category}, max_pages={max_pages})")

    # ── Phase 1: Discover URLs ──
    urls = []
    discovery_method = None
    if use_sitemap:
        sitemap_url = discover_sitemap_url(base_url, config)
        if sitemap_url:
            urls = parse_sitemap(sitemap_url, config)
            discovery_method = 'sitemap'
    if not urls and use_links:
        logger.info("No sitemap URLs, falling back to link crawl...")
        urls = crawl_links(base_url, max_depth=max_depth, max_pages=max_pages, config=config)
        discovery_method = 'link_crawl'
    if not urls:
        logger.warning(f"No URLs discovered for {base_url}")
        return {
            'site': base_url,
            'discovery_method': None,
            'urls_discovered': 0,
            'urls_after_filter': 0,
            'results': [],
            'summary': {'total': 0, 'succeeded': 0, 'duplicates': 0, 'failed': 0},
        }

    # ── Phase 2: Filter URLs ──
    # Copy so the configured default list is never mutated.
    all_exclude = list(cfg['default_exclude'])
    if exclude:
        all_exclude.extend(exclude)
    urls = filter_urls(urls, include=include, exclude=all_exclude)
    if len(urls) > max_pages:
        logger.info(f"Limiting to {max_pages} pages (discovered {len(urls)})")
        urls = urls[:max_pages]
    logger.info(f"After filtering: {len(urls)} URLs to process")

    # ── Dry run ──
    if dry_run:
        return {
            'site': base_url,
            'discovery_method': discovery_method,
            'dry_run': True,
            'urls_discovered': len(urls),
            'urls': urls,
        }

    # ── Phase 3: Ingest each URL ──
    from .web_scraper import ingest_url

    results = []
    total = len(urls)
    for i, url in enumerate(urls, 1):
        logger.info(f"[{i}/{total}] Ingesting: {url}")
        try:
            result = ingest_url(url, category=category, source=source, config=config)
            result['url'] = url
            results.append(result)
        except Exception as e:
            # Per-URL boundary: one bad page must not abort the crawl.
            logger.error(f" FAILED: {url} -- {e}")
            results.append({
                'url': url,
                'status': 'failed',
                'error': str(e),
            })
        else:
            status = result.get('status', 'unknown')
            title = result.get('title', '')
            if status == 'duplicate':
                logger.info(f" DUPLICATE: {title}")
            elif status == 'failed':
                # A failure reported via the return value is counted as
                # failed in the summary, so it must not be logged as OK.
                logger.error(f" FAILED: {url} -- {result.get('error', 'unknown error')}")
            else:
                logger.info(f" OK: {title} ({result.get('page_count', 0)} pages)")
        # No sleep after the final URL.
        if i < total and delay > 0:
            time.sleep(delay)

    # ── Summary ──
    duplicates = sum(1 for r in results if r.get('status') == 'duplicate')
    failed = sum(1 for r in results if r.get('status') == 'failed')
    # Everything that is neither failed nor duplicate counts as succeeded.
    succeeded = len(results) - duplicates - failed
    summary = {
        'total': len(results),
        'succeeded': succeeded,
        'duplicates': duplicates,
        'failed': failed,
    }
    logger.info(f"Crawl complete: {succeeded} new, {duplicates} duplicates, {failed} failed out of {total}")
    return {
        'site': base_url,
        'domain': urlparse(base_url).netloc,
        'category': category,
        'discovery_method': discovery_method,
        'urls_discovered': total,
        'results': results,
        'summary': summary,
    }