mirror of
https://github.com/zvx-echo6/recon.git
synced 2026-05-20 14:44:54 +02:00
Initial commit: RECON codebase baseline
Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete). Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
563c16bb71
59 changed files with 18327 additions and 0 deletions
373
scripts/aa_download.py
Executable file
373
scripts/aa_download.py
Executable file
|
|
@ -0,0 +1,373 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
aa_download.py — Anna's Archive bulk downloader for RECON library acquisition.
|
||||
|
||||
For each target book:
|
||||
1. Searches Anna's Archive (host configured in BASE_AA) for the title + author
|
||||
2. Picks the best PDF match (first result mentioning the author's surname, else the first result)
|
||||
3. Takes the MD5 from the search-result link and reads the book page for metadata
|
||||
4. Attempts download from Libgen mirrors in order
|
||||
5. Verifies downloaded file is a valid PDF
|
||||
6. Writes full acquisition report
|
||||
|
||||
Usage:
|
||||
python3 /opt/recon/scripts/aa_download.py [--dry-run] [--limit N]
|
||||
|
||||
Report output: ~/projects/recon/aa_acquisition_report.md
|
||||
"""
|
||||
|
||||
import argparse
import hashlib
import json
import logging
import random
import re
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
|
||||
|
||||
# Markdown report produced at the end of every run.
REPORT_PATH = Path.home() / "projects/recon/aa_acquisition_report.md"
# Run log; every message is also echoed to the console.
LOG_FILE = Path("/opt/recon/logs/aa_download.log")

# logging.FileHandler opens its file as soon as it is constructed and raises
# FileNotFoundError when the directory is missing, so make sure it exists
# before logging is configured.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()],
)
log = logging.getLogger("aa_download")

# One shared HTTP session so every request carries the same headers.
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Accept-Language": "en-US,en;q=0.9",
})

# Base URL used for both the search page and the per-MD5 detail page.
BASE_AA = "https://annas-archive.gl"

# Download attempt order — try fastest mirrors first
# Each entry is a URL template; "{md5}" is filled in per book.
LIBGEN_MIRRORS = [
    "https://libgen.is/get.php?md5={md5}",
    "https://libgen.rs/get.php?md5={md5}",
    "https://libgen.st/get.php?md5={md5}",
    "https://libgen.li/ads.php?md5={md5}",
]
|
||||
|
||||
# ── Target book list ──────────────────────────────────────────────────────────
# Each entry is unpacked by main() as (title, author, subdir):
#   title    — used in the search query and to build the output filename
#   author   — used in the search query; its last word is matched against
#              search results and appended to the output filename
#   dest_dir — sub-directory, joined onto BASE_LIB, where the PDF is stored
TARGETS = [
    # (title, author, dest_dir)

    # Medical — Herbalism
    ("Medical Herbalism", "David Hoffmann", "Medical/Herbalism"),
    ("Making Plant Medicine", "Richo Cech", "Medical/Herbalism"),
    ("The Earthwise Herbal Volume 1", "Matthew Wood", "Medical/Herbalism"),
    ("The Earthwise Herbal Volume 2", "Matthew Wood", "Medical/Herbalism"),
    ("Herbal Antibiotics", "Stephen Buhner", "Medical/Herbalism"),
    ("Herbal Antivirals", "Stephen Buhner", "Medical/Herbalism"),
    ("The Herbal Medicine-Maker's Handbook", "James Green", "Medical/Herbalism"),
    ("Rosemary Gladstar's Medicinal Herbs", "Rosemary Gladstar", "Medical/Herbalism"),

    # Medical — Austere
    ("Wilderness Medicine", "Paul Auerbach", "Medical/Austere"),
    ("Medicine for Mountaineering", "James Wilkerson", "Medical/Austere"),

    # Medical — Veterinary
    ("The Chicken Health Handbook", "Gail Damerow", "Medical/Veterinary"),
    ("Goat Husbandry", "David Mackenzie", "Medical/Veterinary"),

    # Power Systems
    ("The Renewable Energy Handbook", "William Kemp", "Power"),
    ("Homebrew Wind Power", "Dan Bartmann", "Power"),
    ("Wind Energy Basics", "Paul Gipe", "Power"),
    ("12-Volt Bible", "Brotherton", "Power"),
    ("Wiring a House", "Rex Cauldwell", "Power"),

    # Navigation
    ("Wilderness Navigation", "Bob Burns", "Navigation"),
    ("Be Expert with Map and Compass", "Bjorn Kjellstrom", "Navigation"),
    ("Emergency Navigation", "David Burch", "Navigation"),
    ("The Natural Navigator", "Tristan Gooley", "Navigation"),
    ("The Essential Wilderness Navigator", "David Seidman", "Navigation"),

    # Water Systems
    ("Rainwater Harvesting for Drylands Volume 1", "Brad Lancaster", "Water"),
    ("Rainwater Harvesting for Drylands Volume 2", "Brad Lancaster", "Water"),
    ("Rainwater Harvesting for Drylands Volume 3", "Brad Lancaster", "Water"),
    ("Water Storage", "Art Ludwig", "Water"),
    ("The Home Water Supply", "Stu Campbell", "Water"),

    # Food Systems
    ("The Art of Fermentation", "Sandor Katz", "Food"),
    ("Fermented Vegetables", "Kirsten Shockey", "Food"),
    ("Mastering Artisan Cheesemaking", "Gianaclis Caldwell", "Food"),
    ("Home Cheese Making", "Ricki Carroll", "Food"),
    ("The Art of Natural Cheesemaking", "David Asher", "Food"),

    # Permaculture
    ("Edible Forest Gardens Volume 1", "Dave Jacke", "Permaculture"),
    ("Edible Forest Gardens Volume 2", "Dave Jacke", "Permaculture"),
    ("Creating a Forest Garden", "Martin Crawford", "Permaculture"),
    ("Sepp Holzer's Permaculture", "Sepp Holzer", "Permaculture"),
    ("The Permaculture Handbook", "Peter Bane", "Permaculture"),
    ("The Market Gardener", "Jean-Martin Fortier", "Permaculture"),

    # Scenario / Emergency
    ("SAS Survival Handbook", "John Wiseman", "Scenario"),
    ("Pocket Ref", "Thomas Glover", "Scenario"),
    ("Deep Survival", "Laurence Gonzales", "Scenario"),

    # Foundational Skills
    ("Back to Basics", "Reader's Digest", "Skills"),
    ("A Pattern Language", "Christopher Alexander", "Skills"),
]

# Root directory under which each target's dest_dir is created.
BASE_LIB = Path("/mnt/library/Acquired")
|
||||
|
||||
|
||||
def search_aa(title, author):
    """Query the Anna's Archive search page for a book.

    The query is "<title> <author>", restricted to English PDFs. Returns up
    to five dicts of the form ``{"md5", "text", "href"}``, one per distinct
    MD5 link found on the results page, in page order. Any request failure
    is logged and yields an empty list.
    """
    search_terms = f"{title} {author}"
    endpoint = f"{BASE_AA}/search"
    try:
        response = SESSION.get(
            endpoint,
            params={"q": search_terms, "ext": "pdf", "lang": "en"},
            timeout=20,
        )
        response.raise_for_status()
    except Exception as e:
        log.warning(f"Search failed for '{title}': {e}")
        return []

    page = BeautifulSoup(response.text, "html.parser")
    hits = []
    known_digests = set()

    for anchor in page.select("a[href^='/md5/']"):
        link = anchor.get("href", "")
        # Keep only the digest: drop any trailing path segment or query string.
        tail = link.split("/md5/")[-1]
        digest = tail.split("/")[0].split("?")[0].strip()
        if len(digest) != 32:
            continue

        label = anchor.get_text(" ", strip=True)
        if label and digest not in known_digests:
            known_digests.add(digest)
            hits.append({"md5": digest, "text": label, "href": link})
            if len(hits) >= 5:
                break

    return hits
|
||||
|
||||
|
||||
def get_book_details(md5):
    """Fetch the book detail page and extract useful metadata.

    Args:
        md5: MD5 identifier of the record, as found in a search-result link.

    Returns:
        ``{"pages": int or None, "text": str}`` where ``text`` is the first
        500 characters of the page's visible text. Returns ``{}`` when the
        page cannot be fetched or parsed; the failure is logged, not raised.
    """
    url = f"{BASE_AA}/md5/{md5}"
    try:
        r = SESSION.get(url, timeout=20)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        text = soup.get_text(" ", strip=True)

        pages = None
        # Prefer a number that is explicitly followed by "page(s)", when the
        # page text contains one: a bare number is just as likely to be a
        # publication year or an ID.
        for match in re.finditer(r"\b(\d+)\s*pages?\b", text, re.IGNORECASE):
            if 50 < int(match.group(1)) < 5000:
                pages = int(match.group(1))
                break

        if pages is None:
            # Fallback heuristic: first standalone integer in a plausible
            # range. isdecimal() rather than isdigit(): the latter accepts
            # characters such as "²" that int() rejects, which used to throw
            # and discard every detail for the book.
            for word in text.split():
                if word.isdecimal() and 50 < int(word) < 5000:
                    pages = int(word)
                    break

        return {"pages": pages, "text": text[:500]}
    except Exception as e:
        log.warning(f"Detail fetch failed for md5={md5}: {e}")
        return {}
|
||||
|
||||
|
||||
def try_download(md5, dest_path):
    """Try each libgen mirror until one works. Returns True on success.

    Args:
        md5: MD5 identifier substituted into every LIBGEN_MIRRORS template.
        dest_path: ``pathlib.Path`` of the final PDF. Parent directories are
            created as needed.

    Returns:
        True once a mirror has delivered a file starting with ``%PDF``;
        False when every mirror failed. Mirror errors are logged, not raised.

    The payload is streamed into a sibling ``<name>.part`` file and only
    moved onto ``dest_path`` after the PDF check, so an interrupted or bogus
    download never leaves a file at ``dest_path`` for later runs to mistake
    for a finished book.
    """
    part_path = dest_path.with_name(dest_path.name + ".part")

    for mirror_tpl in LIBGEN_MIRRORS:
        url = mirror_tpl.format(md5=md5)
        try:
            if _fetch_pdf_to(url, part_path):
                part_path.replace(dest_path)
                size_mb = dest_path.stat().st_size / 1024 / 1024
                log.info(f"  [OK] {dest_path.name} ({size_mb:.1f}MB) via {url}")
                return True
        except Exception as e:
            log.warning(f"  Mirror failed {url}: {e}")
        finally:
            # No-op after a successful replace(); otherwise removes the
            # partial or rejected download.
            part_path.unlink(missing_ok=True)

    return False


def _fetch_pdf_to(url, part_path):
    """Stream *url* into *part_path*; return True only if it is a PDF."""
    responses = []
    try:
        r = SESSION.get(url, timeout=60, stream=True, allow_redirects=True)
        responses.append(r)
        if r.status_code != 200:
            return False

        # Some mirrors return an HTML ads page before the real file
        if "text/html" in r.headers.get("content-type", ""):
            soup = BeautifulSoup(r.text, "html.parser")
            dl_link = soup.select_one("a[href*='.pdf']")
            if not dl_link:
                dl_link = soup.select_one("a[href*='get.php']")
            if not dl_link:
                return False
            # Resolve relative links against the page that served them,
            # not against one hard-coded mirror host.
            actual_url = urljoin(r.url, dl_link["href"])
            r = SESSION.get(actual_url, timeout=120, stream=True)
            responses.append(r)
            if r.status_code != 200:
                return False

        # Stream to disk
        part_path.parent.mkdir(parents=True, exist_ok=True)
        with open(part_path, "wb") as f:
            for chunk in r.iter_content(8192):
                f.write(chunk)
    finally:
        # Streamed responses hold their connection until closed.
        for resp in responses:
            resp.close()

    # Verify it's a real PDF
    with open(part_path, "rb") as f:
        header = f.read(4)
    if header == b"%PDF":
        return True
    log.warning(f"  [BAD] Not a PDF from {url}")
    return False
|
||||
|
||||
|
||||
def process_book(title, author, subdir, dry_run):
    """Full search + download pipeline for one book.

    Args:
        title: Book title; used in the search query and the output filename.
        author: Author name; its last word is treated as the surname.
        subdir: Directory, relative to BASE_LIB, that receives the PDF.
        dry_run: When true, stop after the search and metadata lookup.

    Returns:
        A dict with keys ``title``, ``author``, ``status``, ``md5``,
        ``pages``, ``file`` and ``notes``. ``status`` is one of
        "NOT FOUND", "DRY RUN — found", "ALREADY EXISTS", "DOWNLOADED" or
        "MD5 ONLY".
    """
    log.info(f"[SEARCH] '{title}' — {author}")
    result = {
        "title": title,
        "author": author,
        "status": "NOT FOUND",
        "md5": "",
        "pages": "",
        "file": "",
        "notes": "",
    }

    candidates = search_aa(title, author)
    if not candidates:
        result["notes"] = "No results from AA search"
        return result

    # A blank author must not crash the run; an empty surname matches every
    # candidate, so the first result is taken.
    author_words = author.split()
    surname = author_words[-1] if author_words else ""

    # Pick best candidate — prefer one whose text contains author name,
    # otherwise take the first result.
    needle = surname.lower()
    best = next(
        (c for c in candidates if needle in c["text"].lower()),
        candidates[0],
    )

    md5 = best["md5"]
    result["md5"] = md5

    details = get_book_details(md5)
    # Normalise an unknown page count to "" so the report never shows "None".
    pages = details.get("pages")
    result["pages"] = "" if pages is None else pages

    if dry_run:
        result["status"] = "DRY RUN — found"
        result["notes"] = f"MD5: {md5}"
        return result

    # Build destination path. Both parts are sanitised so that neither can
    # introduce path separators or other unsafe characters.
    safe_title = "".join(c if c.isalnum() or c in " ._-" else "_" for c in title)[:60]
    safe_author = "".join(c if c.isalnum() or c in " ._-" else "_" for c in surname) or "unknown"
    filename = f"{safe_title}_{safe_author}.pdf"
    dest = BASE_LIB / subdir / filename

    if dest.exists():
        result["status"] = "ALREADY EXISTS"
        result["file"] = str(dest)
        return result

    log.info(f"  MD5: {md5} — attempting download...")
    if try_download(md5, dest):
        result["status"] = "DOWNLOADED"
        result["file"] = str(dest)
    else:
        result["status"] = "MD5 ONLY"
        result["notes"] = f"All mirrors failed. MD5: {md5}"

    return result
|
||||
|
||||
|
||||
def write_report(results):
    """Write the Markdown acquisition report to REPORT_PATH.

    Args:
        results: List of result dicts as produced by ``process_book`` (keys
            ``title``, ``author``, ``status``, ``md5``, ``pages``, ``file``,
            ``notes``).

    The report starts with a per-status count table, followed by one section
    per non-empty status group. Dry-run results get their own row and
    section, which only appear when such results exist. The file is written
    as UTF-8 because the report contains non-ASCII characters.
    """
    REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)

    # Group once instead of rescanning the list for every status.
    by_status = {}
    for r in results:
        by_status.setdefault(r["status"], []).append(r)
    downloaded = by_status.get("DOWNLOADED", [])
    md5_only = by_status.get("MD5 ONLY", [])
    not_found = by_status.get("NOT FOUND", [])
    already_have = by_status.get("ALREADY EXISTS", [])
    dry_run_found = by_status.get("DRY RUN — found", [])

    lines = [
        "# Anna's Archive Acquisition Report",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"**Total searched:** {len(results)}",
        "",
        "| Status | Count |",
        "|--------|-------|",
        f"| Downloaded | {len(downloaded)} |",
        f"| MD5 only (mirrors failed) | {len(md5_only)} |",
        f"| Not found on AA | {len(not_found)} |",
        f"| Already in library | {len(already_have)} |",
    ]
    if dry_run_found:
        lines.append(f"| Found (dry run, not downloaded) | {len(dry_run_found)} |")
    lines.append("")

    if downloaded:
        lines += ["## Downloaded", ""]
        lines += ["| Title | Author | Pages | File |", "|-------|--------|-------|------|"]
        for r in downloaded:
            lines.append(f"| {r['title']} | {r['author']} | {r['pages']} | `{Path(r['file']).name}` |")
        lines.append("")

    if md5_only:
        lines += ["## Found on AA — Download Failed (use MD5 for manual retrieval)", ""]
        lines += ["| Title | Author | MD5 | Notes |", "|-------|--------|-----|-------|"]
        for r in md5_only:
            lines.append(f"| {r['title']} | {r['author']} | `{r['md5']}` | {r['notes']} |")
        lines.append("")

    if not_found:
        lines += ["## Not Found on Anna's Archive", ""]
        lines += ["| Title | Author | Notes |", "|-------|--------|-------|"]
        for r in not_found:
            lines.append(f"| {r['title']} | {r['author']} | {r['notes']} |")
        lines.append("")

    if already_have:
        lines += ["## Already in Library", ""]
        lines += ["| Title | Author |", "|-------|--------|"]
        for r in already_have:
            lines.append(f"| {r['title']} | {r['author']} |")
        lines.append("")

    if dry_run_found:
        # Without this section a --dry-run report would list nothing at all.
        lines += ["## Found on AA — Dry Run (nothing downloaded)", ""]
        lines += ["| Title | Author | Pages | MD5 |", "|-------|--------|-------|-----|"]
        for r in dry_run_found:
            lines.append(f"| {r['title']} | {r['author']} | {r['pages']} | `{r['md5']}` |")
        lines.append("")

    REPORT_PATH.write_text("\n".join(lines), encoding="utf-8")
    log.info(f"Report written to {REPORT_PATH}")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: process the target list and write the report.

    Options:
        --dry-run   search and look up metadata only; download nothing.
        --limit N   process only the first N entries of TARGETS.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()

    # Compare against None: a truthiness test would treat "--limit 0" as
    # "no limit" and process every book.
    targets = TARGETS if args.limit is None else TARGETS[:args.limit]
    log.info(f"Starting AA acquisition: {len(targets)} books | dry_run={args.dry_run}")

    results = []
    for i, (title, author, subdir) in enumerate(targets, 1):
        log.info(f"[{i}/{len(targets)}]")
        result = process_book(title, author, subdir, args.dry_run)
        results.append(result)
        log.info(f"  -> {result['status']}")
        # Polite delay between requests; pointless after the final book.
        if i < len(targets):
            time.sleep(random.uniform(8, 15))

    write_report(results)

    print("\n-- Summary -----------------------------------------------")
    for status in ["DOWNLOADED", "MD5 ONLY", "NOT FOUND", "ALREADY EXISTS", "DRY RUN — found"]:
        count = sum(1 for r in results if r["status"] == status)
        if count:
            print(f"  {status:<35} {count:>3}")
    print(f"  Report: {REPORT_PATH}")


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue