Initial commit: RECON codebase baseline

Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete).
Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-04-14 14:57:23 +00:00
commit 563c16bb71
59 changed files with 18327 additions and 0 deletions

373
scripts/aa_download.py Executable file
View file

@ -0,0 +1,373 @@
#!/usr/bin/env python3
"""
aa_download.py — Anna's Archive bulk downloader for RECON library acquisition.
For each target book:
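1. Searches Anna's Archive (the host configured in BASE_AA) for the title + author
2. Picks the best PDF match (first result whose text mentions the author's surname, otherwise the first result)
3. Takes the MD5 from the search-result link and reads a page count from the book page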
4. Attempts download from Libgen mirrors in order
5. Verifies downloaded file is a valid PDF
6. Writes full acquisition report
Usage:
python3 /opt/recon/scripts/aa_download.py [--dry-run] [--limit N]
Report output: ~/projects/recon/aa_acquisition_report.md
"""
import json
import time
import random
import hashlib
import logging
import argparse
from pathlib import Path
from datetime import datetime
import requests
from bs4 import BeautifulSoup
# Markdown acquisition report written by write_report() at the end of a run.
REPORT_PATH = Path.home() / "projects/recon/aa_acquisition_report.md"
# Log file handed to the FileHandler below when this module is loaded.
# NOTE(review): nothing in this file creates /opt/recon/logs — confirm the
# directory exists before running.
LOG_FILE = Path("/opt/recon/logs/aa_download.log")
# Send INFO-and-above records to both the log file and the console.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s",
handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
)
log = logging.getLogger("aa_download")
# One HTTP session shared by every request in this script; it sends a
# desktop-Firefox User-Agent and an English Accept-Language header.
SESSION = requests.Session()
SESSION.headers.update({
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
"Accept-Language": "en-US,en;q=0.9",
})
# Base URL used to build the Anna's Archive search and /md5/ detail page URLs.
BASE_AA = "https://annas-archive.gl"
# Download attempt order — try fastest mirrors first
# Each entry is a URL template; try_download() fills in {md5} and walks the
# list from top to bottom.
LIBGEN_MIRRORS = [
"https://libgen.is/get.php?md5={md5}",
"https://libgen.rs/get.php?md5={md5}",
"https://libgen.st/get.php?md5={md5}",
"https://libgen.li/ads.php?md5={md5}",
]
# ── Target book list ──────────────────────────────────────────────────────────
# Books to acquire. Each entry is a (title, author, dest_dir) tuple; main()
# unpacks them in this order and process_book() joins dest_dir onto BASE_LIB.
TARGETS = [
# (title, author, dest_dir)
# Medical — Herbalism
("Medical Herbalism", "David Hoffmann", "Medical/Herbalism"),
("Making Plant Medicine", "Richo Cech", "Medical/Herbalism"),
("The Earthwise Herbal Volume 1", "Matthew Wood", "Medical/Herbalism"),
("The Earthwise Herbal Volume 2", "Matthew Wood", "Medical/Herbalism"),
("Herbal Antibiotics", "Stephen Buhner", "Medical/Herbalism"),
("Herbal Antivirals", "Stephen Buhner", "Medical/Herbalism"),
("The Herbal Medicine-Maker's Handbook", "James Green", "Medical/Herbalism"),
("Rosemary Gladstar's Medicinal Herbs", "Rosemary Gladstar", "Medical/Herbalism"),
# Medical — Austere
("Wilderness Medicine", "Paul Auerbach", "Medical/Austere"),
("Medicine for Mountaineering", "James Wilkerson", "Medical/Austere"),
# Medical — Veterinary
("The Chicken Health Handbook", "Gail Damerow", "Medical/Veterinary"),
("Goat Husbandry", "David Mackenzie", "Medical/Veterinary"),
# Power Systems
("The Renewable Energy Handbook", "William Kemp", "Power"),
("Homebrew Wind Power", "Dan Bartmann", "Power"),
("Wind Energy Basics", "Paul Gipe", "Power"),
("12-Volt Bible", "Brotherton", "Power"),
("Wiring a House", "Rex Cauldwell", "Power"),
# Navigation
("Wilderness Navigation", "Bob Burns", "Navigation"),
("Be Expert with Map and Compass", "Bjorn Kjellstrom", "Navigation"),
("Emergency Navigation", "David Burch", "Navigation"),
("The Natural Navigator", "Tristan Gooley", "Navigation"),
("The Essential Wilderness Navigator", "David Seidman", "Navigation"),
# Water Systems
("Rainwater Harvesting for Drylands Volume 1", "Brad Lancaster", "Water"),
("Rainwater Harvesting for Drylands Volume 2", "Brad Lancaster", "Water"),
("Rainwater Harvesting for Drylands Volume 3", "Brad Lancaster", "Water"),
("Water Storage", "Art Ludwig", "Water"),
("The Home Water Supply", "Stu Campbell", "Water"),
# Food Systems
("The Art of Fermentation", "Sandor Katz", "Food"),
("Fermented Vegetables", "Kirsten Shockey", "Food"),
("Mastering Artisan Cheesemaking", "Gianaclis Caldwell", "Food"),
("Home Cheese Making", "Ricki Carroll", "Food"),
("The Art of Natural Cheesemaking", "David Asher", "Food"),
# Permaculture
("Edible Forest Gardens Volume 1", "Dave Jacke", "Permaculture"),
("Edible Forest Gardens Volume 2", "Dave Jacke", "Permaculture"),
("Creating a Forest Garden", "Martin Crawford", "Permaculture"),
("Sepp Holzer's Permaculture", "Sepp Holzer", "Permaculture"),
("The Permaculture Handbook", "Peter Bane", "Permaculture"),
("The Market Gardener", "Jean-Martin Fortier", "Permaculture"),
# Scenario / Emergency
("SAS Survival Handbook", "John Wiseman", "Scenario"),
("Pocket Ref", "Thomas Glover", "Scenario"),
("Deep Survival", "Laurence Gonzales", "Scenario"),
# Foundational Skills
("Back to Basics", "Reader's Digest", "Skills"),
("A Pattern Language", "Christopher Alexander", "Skills"),
]
# Root directory for downloads; process_book() stores each file at
# BASE_LIB / dest_dir / <generated filename>.
BASE_LIB = Path("/mnt/library/Acquired")
def search_aa(title, author):
    """Query the Anna's Archive search page for English PDFs of a book.

    The query string is "<title> <author>". Every ``/md5/...`` link on the
    result page becomes a candidate, in page order, keeping at most five
    distinct 32-character hashes.

    Args:
        title: Book title; also used in the warning logged on failure.
        author: Author name appended to the query.

    Returns:
        A list of dicts with keys ``md5``, ``text`` (the link's visible text)
        and ``href``. The list is empty when the request fails or nothing
        matches.
    """
    search_url = f"{BASE_AA}/search"
    query_params = {"q": f"{title} {author}", "ext": "pdf", "lang": "en"}
    try:
        response = SESSION.get(search_url, params=query_params, timeout=20)
        response.raise_for_status()
    except Exception as e:
        # Best effort: a failed search is reported as "no candidates".
        log.warning(f"Search failed for '{title}': {e}")
        return []

    page = BeautifulSoup(response.text, "html.parser")
    candidates = []
    known_hashes = set()
    for anchor in page.select("a[href^='/md5/']"):
        href = anchor.get("href", "")
        # Keep only the hash segment: drop any trailing path or query string.
        md5 = href.split("/md5/")[-1].split("/")[0].split("?")[0].strip()
        if not md5 or len(md5) != 32:
            continue
        label = anchor.get_text(" ", strip=True)
        if label and md5 not in known_hashes:
            known_hashes.add(md5)
            candidates.append({"md5": md5, "text": label, "href": href})
            if len(candidates) >= 5:
                break
    return candidates
def get_book_details(md5):
    """Fetch the ``/md5/<md5>`` detail page and pull out basic metadata.

    The page count is a heuristic: the first whitespace-separated token of the
    page text that is all digits and lies strictly between 50 and 5000.
    NOTE(review): a publication year would also satisfy this test — confirm
    against the real page layout before trusting the value.

    Args:
        md5: Hash identifying the book on Anna's Archive.

    Returns:
        ``{"pages": int | None, "text": <first 500 chars of page text>}``, or
        an empty dict when the request or parsing fails.
    """
    detail_url = f"{BASE_AA}/md5/{md5}"
    try:
        response = SESSION.get(detail_url, timeout=20)
        response.raise_for_status()
        page_text = BeautifulSoup(response.text, "html.parser").get_text(" ", strip=True)
        page_count = next(
            (
                int(token)
                for token in page_text.split()
                if token.isdigit() and 50 < int(token) < 5000
            ),
            None,
        )
        return {"pages": page_count, "text": page_text[:500]}
    except Exception as e:
        log.warning(f"Detail fetch failed for md5={md5}: {e}")
        return {}
def try_download(md5, dest_path):
    """Download the file identified by ``md5`` from the first working mirror.

    Each template in LIBGEN_MIRRORS is tried in order. If a mirror answers
    with an HTML page, the first link containing ``.pdf`` (or, failing that,
    ``get.php``) is followed once. The payload is streamed to a sibling
    ``<name>.part`` file and only renamed to ``dest_path`` after its first
    four bytes are confirmed to be ``%PDF``. An interrupted or bogus download
    therefore never leaves a file at ``dest_path``.

    Args:
        md5: Hash substituted into each mirror URL template.
        dest_path: ``pathlib.Path`` of the final PDF; parent directories are
            created on demand.

    Returns:
        True if a PDF was saved to ``dest_path``, False if every mirror failed.
    """
    # Function-scope import: urljoin is not imported at module level.
    from urllib.parse import urljoin

    part_path = dest_path.with_name(dest_path.name + ".part")
    for mirror_tpl in LIBGEN_MIRRORS:
        url = mirror_tpl.format(md5=md5)
        response = None
        try:
            response = SESSION.get(url, timeout=60, stream=True, allow_redirects=True)
            if response.status_code != 200:
                continue
            # Some mirrors return an HTML ads page before the real file
            if "text/html" in response.headers.get("content-type", ""):
                soup = BeautifulSoup(response.text, "html.parser")
                dl_link = (
                    soup.select_one("a[href*='.pdf']")
                    or soup.select_one("a[href*='get.php']")
                )
                if not dl_link:
                    continue
                # Resolve against the page that served the link, so relative
                # hrefs stay on the same mirror instead of a hard-coded host.
                actual_url = urljoin(response.url, dl_link["href"])
                response.close()
                response = SESSION.get(actual_url, timeout=120, stream=True)
                if response.status_code != 200:
                    continue
            # Stream to a temporary file next to the destination
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            with open(part_path, "wb") as f:
                for chunk in response.iter_content(8192):
                    f.write(chunk)
            # Verify it's a real PDF before publishing it under the final name
            with open(part_path, "rb") as f:
                header = f.read(4)
            if header != b"%PDF":
                log.warning(f" [BAD] Not a PDF from {url}")
                continue
            part_path.replace(dest_path)
            size_mb = dest_path.stat().st_size / 1024 / 1024
            log.info(f" [OK] {dest_path.name} ({size_mb:.1f}MB) via {url}")
            return True
        except Exception as e:
            # Best effort: any failure on one mirror just moves on to the next.
            log.warning(f" Mirror failed {url}: {e}")
        finally:
            # Release the streamed connection and drop any partial download.
            if response is not None:
                response.close()
            part_path.unlink(missing_ok=True)
    return False
def _safe_component(text, max_len=None):
    """Return ``text`` with every character outside alnum/space/._- replaced by '_'."""
    cleaned = "".join(ch if ch.isalnum() or ch in " ._-" else "_" for ch in text)
    return cleaned if max_len is None else cleaned[:max_len]


def process_book(title, author, subdir, dry_run):
    """Run the full search + download pipeline for one book.

    Args:
        title: Book title to search for; also used to build the filename.
        author: Author name. Its last whitespace-separated word is used to
            pick a candidate and as the filename suffix.
        subdir: Destination directory relative to BASE_LIB.
        dry_run: When true, stop after the search and record the MD5 only.

    Returns:
        A dict with keys ``title``, ``author``, ``status``, ``md5``, ``pages``,
        ``file`` and ``notes``. ``status`` is one of "NOT FOUND",
        "DRY RUN — found", "ALREADY EXISTS", "DOWNLOADED" or "MD5 ONLY".
    """
    log.info(f"[SEARCH] '{title}' — {author}")
    result = {
        "title": title,
        "author": author,
        "status": "NOT FOUND",
        "md5": "",
        "pages": "",
        "file": "",
        "notes": "",
    }
    candidates = search_aa(title, author)
    if not candidates:
        result["notes"] = "No results from AA search"
        return result

    # An empty or whitespace-only author yields no surname instead of raising.
    name_parts = author.split()
    surname = name_parts[-1] if name_parts else ""
    needle = surname.lower()

    # Pick best candidate — prefer one whose text contains author name
    best = next(
        (c for c in candidates if needle and needle in c["text"].lower()),
        None,
    )
    if best is None:
        # Make the weaker match visible, because it may be the wrong book.
        log.warning(f" No result mentions '{surname}'; using first search result")
        best = candidates[0]

    md5 = best["md5"]
    result["md5"] = md5
    details = get_book_details(md5)
    # "pages" can be present but None; keep the report cell empty in that case.
    result["pages"] = details.get("pages") or ""

    if dry_run:
        result["status"] = "DRY RUN — found"
        result["notes"] = f"MD5: {md5}"
        return result

    # Build destination path; both name parts are sanitised so neither can
    # introduce a path separator.
    safe_title = _safe_component(title, 60)
    safe_author = _safe_component(surname)
    dest = BASE_LIB / subdir / f"{safe_title}_{safe_author}.pdf"
    if dest.exists():
        result["status"] = "ALREADY EXISTS"
        result["file"] = str(dest)
        return result

    log.info(f" MD5: {md5} — attempting download...")
    if try_download(md5, dest):
        result["status"] = "DOWNLOADED"
        result["file"] = str(dest)
    else:
        result["status"] = "MD5 ONLY"
        result["notes"] = f"All mirrors failed. MD5: {md5}"
    return result
def write_report(results):
    """Write the Markdown acquisition report to REPORT_PATH.

    The report starts with a status-count table, followed by one table per
    non-empty status group. Dry-run results get their own row and section,
    which are emitted only when such results exist, so reports from normal
    runs keep their previous layout.

    Args:
        results: Iterable of result dicts as produced by process_book().
    """
    REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)

    # Group once instead of rescanning the list for every status.
    by_status = {}
    for r in results:
        by_status.setdefault(r["status"], []).append(r)
    downloaded = by_status.get("DOWNLOADED", [])
    md5_only = by_status.get("MD5 ONLY", [])
    not_found = by_status.get("NOT FOUND", [])
    already_have = by_status.get("ALREADY EXISTS", [])
    dry_run_found = by_status.get("DRY RUN — found", [])

    lines = [
        "# Anna's Archive Acquisition Report",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"**Total searched:** {len(results)}",
        "",
        "| Status | Count |",
        "|--------|-------|",
        f"| Downloaded | {len(downloaded)} |",
        f"| MD5 only (mirrors failed) | {len(md5_only)} |",
        f"| Not found on AA | {len(not_found)} |",
        f"| Already in library | {len(already_have)} |",
    ]
    if dry_run_found:
        lines.append(f"| Dry run — found (not downloaded) | {len(dry_run_found)} |")
    lines.append("")

    if downloaded:
        lines += ["## Downloaded", ""]
        lines += ["| Title | Author | Pages | File |", "|-------|--------|-------|------|"]
        for r in downloaded:
            lines.append(f"| {r['title']} | {r['author']} | {r['pages']} | `{Path(r['file']).name}` |")
        lines.append("")
    if dry_run_found:
        lines += ["## Dry Run — Found (not downloaded)", ""]
        lines += ["| Title | Author | MD5 | Pages |", "|-------|--------|-----|-------|"]
        for r in dry_run_found:
            lines.append(f"| {r['title']} | {r['author']} | `{r['md5']}` | {r['pages']} |")
        lines.append("")
    if md5_only:
        lines += ["## Found on AA — Download Failed (use MD5 for manual retrieval)", ""]
        lines += ["| Title | Author | MD5 | Notes |", "|-------|--------|-----|-------|"]
        for r in md5_only:
            lines.append(f"| {r['title']} | {r['author']} | `{r['md5']}` | {r['notes']} |")
        lines.append("")
    if not_found:
        lines += ["## Not Found on Anna's Archive", ""]
        lines += ["| Title | Author | Notes |", "|-------|--------|-------|"]
        for r in not_found:
            lines.append(f"| {r['title']} | {r['author']} | {r['notes']} |")
        lines.append("")
    if already_have:
        lines += ["## Already in Library", ""]
        lines += ["| Title | Author |", "|-------|--------|"]
        for r in already_have:
            lines.append(f"| {r['title']} | {r['author']} |")
        lines.append("")

    # The headings contain non-ASCII characters, so the encoding is pinned
    # rather than left to the locale default.
    REPORT_PATH.write_text("\n".join(lines), encoding="utf-8")
    log.info(f"Report written to {REPORT_PATH}")
def main():
    """Command-line entry point: process TARGETS, write the report, print a summary.

    Flags:
        --dry-run: search only and record MD5s; nothing is downloaded.
        --limit N: process only the first N targets. ``--limit 0`` processes
            none; a negative value is rejected.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer")

    # Compare against None so that an explicit 0 is honoured, not treated as "no limit".
    targets = TARGETS if args.limit is None else TARGETS[:args.limit]
    log.info(f"Starting AA acquisition: {len(targets)} books | dry_run={args.dry_run}")

    results = []
    total = len(targets)
    for i, (title, author, subdir) in enumerate(targets, 1):
        log.info(f"[{i}/{total}]")
        result = process_book(title, author, subdir, args.dry_run)
        results.append(result)
        log.info(f" -> {result['status']}")
        # Polite delay between requests; pointless once the last book is done.
        if i < total:
            time.sleep(random.uniform(8, 15))

    write_report(results)

    print("\n-- Summary -----------------------------------------------")
    for status in ["DOWNLOADED", "MD5 ONLY", "NOT FOUND", "ALREADY EXISTS", "DRY RUN — found"]:
        count = sum(1 for r in results if r["status"] == status)
        if count:
            print(f" {status:<35} {count:>3}")
    print(f" Report: {REPORT_PATH}")


if __name__ == "__main__":
    main()