recon/scripts/aa_download_pass2.py
Matt 563c16bb71 Initial commit: RECON codebase baseline
Current state of the pipeline code as of 2026-04-14 (Phase 1 scaffolding complete).
Config has new_pipeline.enabled=false and crawler.sites=[] per refactor plan.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-14 14:57:23 +00:00

478 lines
19 KiB
Python
Executable file

#!/usr/bin/env python3
"""
aa_download_pass2.py — Second-pass downloader for books that failed in pass 1.
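Starts from the MD5s hard-coded in PASS1_FAILURES (falling back to the pass 1
report for entries that have no MD5) and tries, in order:
1. Z-Library search by title/author (separate catalog from Libgen)
2. IPFS gateways using the IPFS CIDs scraped from AA's detail page (a CID is
   not the MD5, so it has to be looked up first)
3. Alternative Libgen mirrors not tried in pass 1
4. Direct AA slow download with longer timeout + retry (listed as a strategy,
   but not implemented in this script)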
Checkpoint: saves progress to /opt/recon/data/aa_pass2_checkpoint.json
so interrupted runs resume where they left off.
Usage:
python3 /opt/recon/scripts/aa_download_pass2.py [--dry-run]
"""
import json
import time
import random
import logging
import hashlib
import argparse
from pathlib import Path
from datetime import datetime
import requests
from bs4 import BeautifulSoup
# ── Paths ─────────────────────────────────────────────────────────────────────
# Log file for this script; opened by the FileHandler below at import time.
LOG_FILE = Path("/opt/recon/logs/aa_download_pass2.log")
# Pass 1 report (input) and the pass 2 report written by this script (output).
REPORT_IN = Path.home() / "projects/recon/aa_acquisition_report.md"
REPORT_OUT = Path.home() / "projects/recon/aa_acquisition_report_pass2.md"
# Resume state: JSON object mapping title -> result dict.
CHECKPOINT = Path("/opt/recon/data/aa_pass2_checkpoint.json")
# Root of the library tree; each book is stored under BASE_LIB/<subdir>/.
BASE_LIB = Path("/mnt/library/Acquired")

# logging.FileHandler opens LOG_FILE immediately and raises FileNotFoundError
# when its directory does not exist, so make sure the directory is there first.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()],
)
log = logging.getLogger("aa_pass2")

# One shared HTTP session so every request sends the same browser-like headers.
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Accept-Language": "en-US,en;q=0.9",
})
# ── Mirrors to try in order ───────────────────────────────────────────────────
# Every entry is a URL template with a single {md5} placeholder.
#
# IPFS gateways are deliberately NOT listed here: an /ipfs/<id> path needs a
# content identifier (CID), and an MD5 hex digest is not one, so such requests
# can never succeed. IPFS is handled separately with CIDs scraped from the
# AA detail page (get_ipfs_cids / try_ipfs_download).
MIRRORS = [
    # Libgen alternatives
    "https://libgen.li/ads.php?md5={md5}",
    "https://library.lol/main/{md5}",
    "https://libgen.rocks/get.php?md5={md5}",
    # Z-Library direct MD5 endpoint (sometimes works)
    "https://z-library.se/md5/{md5}",
]
# ── Books that failed in pass 1 — title, author, md5, subdir ─────────────────
# Each row is (title, author, md5, subdir); subdir is the folder under the
# library root that the PDF is filed in.
#
# NOTE(review): "The Earthwise Herbal" Volume 1 and Volume 2 carry the same
# MD5 below. One of the two is presumably a copy/paste slip — confirm against
# the catalog before trusting either download.
PASS1_FAILURES = [
    # Medical/Herbalism
    ("The Earthwise Herbal Volume 1", "Matthew Wood",
     "fc8dc19f5a17f38849a3979830dc95c1", "Medical/Herbalism"),
    ("The Earthwise Herbal Volume 2", "Matthew Wood",
     "fc8dc19f5a17f38849a3979830dc95c1", "Medical/Herbalism"),
    ("Herbal Antibiotics", "Stephen Buhner",
     "5839dab78edfdff0d7986fac62b814da", "Medical/Herbalism"),
    ("The Herbal Medicine-Maker's Handbook", "James Green",
     "27e8e8a3585705ed194029b69c7d61b1", "Medical/Herbalism"),
    ("Rosemary Gladstar's Medicinal Herbs", "Rosemary Gladstar",
     "9b1966f20a32ab4331bfece167be1dd0", "Medical/Herbalism"),

    # Medical/Austere
    ("Wilderness Medicine", "Paul Auerbach",
     "957818eaa4ec40527bb05902f9ef7c51", "Medical/Austere"),
    ("Medicine for Mountaineering", "James Wilkerson",
     "39cb07998f2034206f0c9472e44cb0b4", "Medical/Austere"),

    # Medical/Veterinary
    ("The Chicken Health Handbook", "Gail Damerow",
     "0ba42fbea034b9a08ec8e2f8d7606efe", "Medical/Veterinary"),

    # Power
    ("The Renewable Energy Handbook", "William Kemp",
     "475d89fa80aea6c45aa4b1b4b9c5e274", "Power"),
    ("Homebrew Wind Power", "Dan Bartmann",
     "0578696d5b1b6bceb3e5e3302c1a31aa", "Power"),
    ("Wind Energy Basics", "Paul Gipe",
     "ccbe9d22e0a5e32d61921d20d66a8e05", "Power"),
    ("12-Volt Bible", "Brotherton",
     "3f964fa6d730fdf2c3d3e231e87cf692", "Power"),
    ("Wiring a House", "Rex Cauldwell",
     "5efcb53450e9eb560210eee40678adcf", "Power"),

    # Navigation
    ("Emergency Navigation", "David Burch",
     "25e4def9e777b3fa9ca935134732ff9d", "Navigation"),

    # Water
    ("Water Storage", "Art Ludwig",
     "17c965ec15c6cf4f09b5377b599a5266", "Water"),
    ("The Home Water Supply", "Stu Campbell",
     "9b22677d2f8e8b39f7a6bf032187295b", "Water"),

    # Food
    ("Fermented Vegetables", "Kirsten Shockey",
     "74d3bde876b4c17be66c21fdfa85213e", "Food"),
    ("The Art of Natural Cheesemaking", "David Asher",
     "bc0e0829d701fea9beca912d39f8cc74", "Food"),

    # Permaculture
    ("Edible Forest Gardens Volume 1", "Dave Jacke",
     "6b069c3bb077fdd89d487a363c070fbb", "Permaculture"),
    ("Edible Forest Gardens Volume 2", "Dave Jacke",
     "699255bfde7f69285c132a94ec291bf4", "Permaculture"),
    ("Creating a Forest Garden", "Martin Crawford",
     "96d71d70dba31ae86e14845f913e557e", "Permaculture"),
    ("Sepp Holzer's Permaculture", "Sepp Holzer",
     "32be55a9fce3e31cacd6912069abb410", "Permaculture"),
    ("The Permaculture Handbook", "Peter Bane",
     "08cb4492739fda4d01b5a868a408e4a0", "Permaculture"),
    ("The Market Gardener", "Jean-Martin Fortier",
     "ac69f6c8c22305b42b539482dc761c19", "Permaculture"),

    # Scenario
    ("SAS Survival Handbook", "John Wiseman",
     "fa967fd5fcbeb3c9887e22f73e590c64", "Scenario"),
    ("Pocket Ref", "Thomas Glover",
     "8e4988ce513a4aa75e7e6c00ee36692b", "Scenario"),
    ("Deep Survival", "Laurence Gonzales",
     "9a907ab13b81ea597407fffdb8ea1b04", "Scenario"),

    # Skills
    ("A Pattern Language", "Christopher Alexander",
     "7f5cc06b5399b65a278c4005ccd8d476", "Skills"),
]
def load_checkpoint():
    """Load the resume checkpoint left by a previous run.

    Returns:
        dict: ``{title: result_dict}`` for books that were already processed.
        An empty dict is returned when the checkpoint file does not exist,
        cannot be read or parsed, or does not contain a JSON object; the
        latter cases are logged as warnings instead of being ignored silently.
    """
    if not CHECKPOINT.exists():
        return {}
    try:
        data = json.loads(CHECKPOINT.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        log.warning(f"Ignoring unreadable checkpoint {CHECKPOINT}: {e}")
        return {}
    if not isinstance(data, dict):
        # Callers index the checkpoint by title, so anything else is unusable.
        log.warning(f"Ignoring checkpoint {CHECKPOINT}: expected a JSON object, "
                    f"got {type(data).__name__}")
        return {}
    return data
def save_checkpoint(completed):
    """Persist *completed* to the checkpoint file (called after each book).

    The JSON is written to a sibling ``.tmp`` file which is then renamed over
    the checkpoint, so the checkpoint path only ever changes via that rename.

    Args:
        completed: dict of ``{title: result_dict}`` to store.
    """
    CHECKPOINT.parent.mkdir(parents=True, exist_ok=True)
    scratch = Path(f"{CHECKPOINT}.tmp")
    with scratch.open("w") as fh:
        json.dump(completed, fh, indent=2)
    scratch.replace(CHECKPOINT)
def load_md5s_from_report():
    """Collect MD5 hashes from the pass 1 Markdown report.

    Scans REPORT_IN for table rows whose first cell is a title and whose third
    cell is a back-quoted MD5 (``| title | ... | `md5` | ...``).

    Returns:
        dict: ``{lower-cased title: md5}``. Empty when the report is missing
        or contains no such rows.
    """
    if not REPORT_IN.exists():
        return {}
    hex_digits = frozenset("0123456789abcdefABCDEF")
    md5_map = {}
    # Decode explicitly so the result does not depend on the process locale;
    # undecodable bytes are replaced rather than aborting the whole parse.
    report = REPORT_IN.read_text(encoding="utf-8", errors="replace")
    for line in report.splitlines():
        if "`" not in line or len(line) <= 30:
            continue
        parts = line.split("|")
        if len(parts) < 4:
            continue
        title = parts[1].strip()
        md5_cell = parts[3].strip().strip("`")
        # An MD5 is exactly 32 hex digits. isalnum() would also let through
        # any other 32-character alphanumeric cell.
        if len(md5_cell) == 32 and hex_digits.issuperset(md5_cell):
            md5_map[title.lower()] = md5_cell
    return md5_map
def search_zlib(title, author):
    """Query the Z-Library search page and return the first book link.

    Only the first three ``/book/`` anchors of the result page are looked at,
    and the first one with a non-empty href wins; the link text is not
    compared against the title or author.

    Args:
        title: Book title, used in the search query.
        author: Author name, appended to the search query.

    Returns:
        str | None: Absolute URL of the book page, or None when the request
        fails, returns a non-200 status, or yields no book link.
    """
    try:
        resp = SESSION.get(
            "https://z-library.se/s/",
            params={"q": f"{title} {author}", "extension[]": "pdf"},
            timeout=15,
        )
        if resp.status_code != 200:
            return None
        page = BeautifulSoup(resp.text, "html.parser")
        # Z-lib book links contain /book/
        top_links = page.select("a[href*='/book/']")[:3]
        first_href = next(
            (h for h in (link.get("href", "") for link in top_links) if h),
            None,
        )
        if first_href is not None:
            if first_href.startswith("/"):
                return f"https://z-library.se{first_href}"
            return first_href
    except Exception as e:
        log.debug(f"Zlib search failed: {e}")
    return None
def try_zlib_download(book_url, dest_path):
    """Download the PDF linked from a Z-Library book page.

    The body is streamed into ``<dest_path>.part`` and only renamed to
    dest_path once it is complete and starts with the ``%PDF`` magic bytes.
    An interrupted or rejected download therefore never leaves a file at
    dest_path that a later run could mistake for a finished book.

    Args:
        book_url: Absolute URL of the book page.
        dest_path: pathlib.Path where the PDF should end up.

    Returns:
        bool: True when a PDF was saved to dest_path, False otherwise.
        Failures are logged at DEBUG level and never raised.
    """
    from urllib.parse import urljoin

    part_path = dest_path.with_name(dest_path.name + ".part")
    try:
        page = SESSION.get(book_url, timeout=15)
        soup = BeautifulSoup(page.text, "html.parser")
        dl = soup.select_one("a.addDownloadedBook, a[href*='/dl/'], a.btn-primary[href*='download']")
        if not dl:
            return False
        # Resolve against the page that was actually served so relative,
        # root-relative and absolute hrefs are all handled correctly.
        dl_url = urljoin(page.url, dl["href"])
        # The context manager releases the streamed connection on every path.
        with SESSION.get(dl_url, timeout=120, stream=True) as r2:
            if r2.status_code != 200:
                return False
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            with open(part_path, "wb") as f:
                for chunk in r2.iter_content(8192):
                    f.write(chunk)
        with open(part_path, "rb") as f:
            is_pdf = f.read(4) == b"%PDF"
        if is_pdf:
            part_path.replace(dest_path)
            return True
    except Exception as e:
        log.debug(f"Zlib download failed: {e}")
    finally:
        # Remove the partial/rejected file (a no-op after a successful rename).
        try:
            part_path.unlink(missing_ok=True)
        except OSError as e:
            log.debug(f"Could not remove {part_path}: {e}")
    return False
def try_mirrors(md5, dest_path):
    """Try every URL template in MIRRORS for *md5* until one yields a PDF.

    A mirror may answer with the file itself or with an HTML landing page. For
    a landing page the download link is taken from the parsed document (so
    HTML entities in the href are decoded) and resolved against the URL of the
    page that served it, rather than against a hard-coded host.

    The body is streamed into ``<dest_path>.part`` and renamed to dest_path
    only after it is complete and starts with ``%PDF``, so a failed attempt
    never leaves a partial file at dest_path.

    Args:
        md5: MD5 hex digest substituted into each mirror template.
        dest_path: pathlib.Path where the PDF should end up.

    Returns:
        bool: True as soon as one mirror delivered a PDF, False when all
        mirrors failed. Per-mirror errors are logged at DEBUG level.
    """
    from urllib.parse import urljoin

    part_path = dest_path.with_name(dest_path.name + ".part")
    for tpl in MIRRORS:
        url = tpl.format(md5=md5)
        resp = None
        try:
            resp = SESSION.get(url, timeout=20, stream=True, allow_redirects=True)
            if resp.status_code != 200:
                continue
            ctype = resp.headers.get("content-type", "")
            if "html" in ctype:
                soup = BeautifulSoup(resp.text, "html.parser")
                # Prefer the keyed get.php link (libgen.li ads page), then
                # fall back to progressively looser download-link guesses.
                dl = (soup.select_one("a[href^='get.php?md5=']") or
                      soup.select_one("a[href*='.pdf']") or
                      soup.select_one("a[href*='get.php']") or
                      soup.select_one("a[href*='/get/']"))
                if not dl:
                    continue
                actual = urljoin(resp.url, dl["href"])
                resp.close()
                resp = SESSION.get(actual, timeout=60, stream=True)
                if resp.status_code != 200:
                    continue
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            with open(part_path, "wb") as f:
                for chunk in resp.iter_content(8192):
                    f.write(chunk)
            with open(part_path, "rb") as f:
                is_pdf = f.read(4) == b"%PDF"
            if is_pdf:
                part_path.replace(dest_path)
                size_mb = dest_path.stat().st_size / 1024 / 1024
                log.info(f" [OK] {size_mb:.1f}MB via {url}")
                return True
        except Exception as e:
            log.debug(f"Mirror {url} failed: {e}")
        finally:
            # Release the streamed connection and drop any partial/rejected
            # file before moving on to the next mirror.
            if resp is not None:
                resp.close()
            try:
                part_path.unlink(missing_ok=True)
            except OSError as e:
                log.debug(f"Could not remove {part_path}: {e}")
        # Pause between mirrors that answered; non-200 replies skip this
        # via `continue`.
        time.sleep(2)
    return False
def get_ipfs_cids(md5):
    """Scrape IPFS CIDs from the AA detail page of *md5*.

    Two textual forms are searched for in the page source: ``ipfs_cid``
    followed by the identifier, and ``ipfs://<identifier>`` links. An
    identifier is any run of at least 46 ASCII letters/digits.

    Args:
        md5: MD5 hex digest identifying the book's detail page.

    Returns:
        list[str]: Unique CIDs in order of first appearance; empty when the
        page could not be fetched or contains none. Callers only try the
        first few entries, so duplicates are dropped rather than letting one
        CID fill that quota.
    """
    import re as _re

    patterns = (
        r'ipfs_cid[:\s]+([A-Za-z0-9]{46,})',
        # Also check for CIDs in href attributes
        r'ipfs://([A-Za-z0-9]{46,})',
    )
    cids = []
    try:
        r = SESSION.get(f"https://annas-archive.gl/md5/{md5}", timeout=20)
        if r.status_code == 200:
            for pattern in patterns:
                for m in _re.finditer(pattern, r.text):
                    cid = m.group(1)
                    if cid not in cids:
                        cids.append(cid)
    except Exception as e:
        log.debug(f"IPFS CID fetch failed: {e}")
    return cids
def try_ipfs_download(cids, dest_path):
    """Try to fetch a PDF for any of *cids* through public IPFS gateways.

    At most the first three CIDs are tried, each against every gateway. The
    body is streamed into ``<dest_path>.part`` and renamed to dest_path only
    after it is complete and starts with ``%PDF``, so a failed or interrupted
    attempt never leaves a partial file at dest_path.

    Args:
        cids: List of IPFS content identifiers.
        dest_path: pathlib.Path where the PDF should end up.

    Returns:
        bool: True as soon as one gateway delivered a PDF, False otherwise.
        Per-gateway errors are logged at DEBUG level.
    """
    gateways = [
        "https://cloudflare-ipfs.com/ipfs/{}",
        "https://dweb.link/ipfs/{}",
    ]
    part_path = dest_path.with_name(dest_path.name + ".part")
    for cid in cids[:3]:  # limit to first 3 CIDs
        for gw_tpl in gateways:
            url = gw_tpl.format(cid)
            try:
                # The context manager releases the streamed connection on
                # every path, including the non-200 `continue`.
                with SESSION.get(url, timeout=15, stream=True) as r:
                    if r.status_code != 200:
                        continue
                    dest_path.parent.mkdir(parents=True, exist_ok=True)
                    with open(part_path, "wb") as f:
                        for chunk in r.iter_content(8192):
                            f.write(chunk)
                with open(part_path, "rb") as f:
                    is_pdf = f.read(4) == b"%PDF"
                if is_pdf:
                    part_path.replace(dest_path)
                    size_mb = dest_path.stat().st_size / 1024 / 1024
                    log.info(f" [OK] {size_mb:.1f}MB via IPFS {url[:60]}...")
                    return True
            except Exception as e:
                log.debug(f"IPFS {url} failed: {e}")
            finally:
                # Drop any partial/rejected file before the next attempt.
                try:
                    part_path.unlink(missing_ok=True)
                except OSError as e:
                    log.debug(f"Could not remove {part_path}: {e}")
            time.sleep(1)
    return False
def search_aa_fresh(title, author):
    """Search the AA domains for *title*/*author* and return a matching MD5.

    A result is accepted when its link text contains the author's last name
    or the first title word that is not a leading article. Matching on the
    very first word would accept nearly every result for titles starting with
    "The", "A" or "An".

    Args:
        title: Book title.
        author: Author name; its last whitespace-separated word is used as
            the surname.

    Returns:
        str | None: The 32-character MD5 of the first acceptable result, or
        None when no domain produced one (or when neither title nor author
        gives anything to match on).
    """
    articles = {"a", "an", "the"}
    title_words = [w.lower() for w in title.split()]
    # First significant word; fall back to the first word for titles that
    # consist of articles only.
    key_word = next((w for w in title_words if w not in articles),
                    title_words[0] if title_words else "")
    author_words = author.split()
    surname = author_words[-1].lower() if author_words else ""
    needles = [n for n in (surname, key_word) if n]
    if not needles:
        return None

    for domain in ["annas-archive.gl", "annas-archive.se", "annas-archive.org"]:
        try:
            url = f"https://{domain}/search"
            params = {"q": f"{title} {author}", "ext": "pdf", "lang": "en"}
            r = SESSION.get(url, params=params, timeout=15)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            for a in soup.select("a[href^='/md5/']"):
                text = a.get_text(" ", strip=True).lower()
                if not text:
                    continue
                md5 = a["href"].split("/md5/")[-1].split("/")[0].strip()
                if len(md5) == 32 and any(n in text for n in needles):
                    return md5
        except Exception as e:
            # Best effort: record why this domain failed, then try the next.
            log.debug(f"AA search on {domain} failed: {e}")
            continue
    return None
def process_book(title, author, md5_hint, subdir, dry_run):
    """Try every acquisition strategy for one book and report the outcome.

    Order of attempts: Z-Library search + download, a fresh AA search when no
    MD5 is known, IPFS gateways using CIDs from the AA detail page, and
    finally the MD5 mirrors. Nothing is attempted when the destination file
    already exists or when *dry_run* is set.

    Args:
        title: Book title; also used (sanitised, max 60 chars) in the filename.
        author: Author name; the last word (sanitised) is used in the filename.
        md5_hint: MD5 known from pass 1, or an empty string.
        subdir: Folder below BASE_LIB the book is filed in.
        dry_run: When true, no network request is made.

    Returns:
        dict: ``title``, ``author``, ``status``, ``md5``, ``file``, ``notes``.
        ``status`` is one of "ALREADY EXISTS", "DRY RUN",
        "DOWNLOADED (Z-Library)", "DOWNLOADED (IPFS)", "DOWNLOADED (mirror)",
        "MD5 ONLY" or "NOT FOUND".
    """
    result = {
        "title": title, "author": author,
        "status": "NOT FOUND", "md5": md5_hint,
        "file": "", "notes": "",
    }

    def _safe(text):
        # Keep letters, digits and " ._-"; everything else (path separators
        # included) becomes "_" so the value stays a single path component.
        return "".join(c if c.isalnum() or c in " ._-" else "_" for c in text)

    safe_title = _safe(title)[:60]
    author_words = author.split()
    # The surname goes through the same sanitiser as the title, and an empty
    # author no longer raises IndexError.
    safe_author = _safe(author_words[-1]) if author_words else "Unknown"
    dest = BASE_LIB / subdir / f"{safe_title}_{safe_author}.pdf"

    if dest.exists():
        result["status"] = "ALREADY EXISTS"
        result["file"] = str(dest)
        return result
    if dry_run:
        result["status"] = "DRY RUN"
        return result

    # 1. Try Z-Library first (different catalog)
    log.info(" Trying Z-Library...")
    zlib_url = search_zlib(title, author)
    if zlib_url and try_zlib_download(zlib_url, dest):
        result["status"] = "DOWNLOADED (Z-Library)"
        result["file"] = str(dest)
        return result

    # 2. If no MD5 from pass 1, do a fresh AA search
    md5 = md5_hint
    if not md5:
        log.info(" Searching AA for fresh MD5...")
        md5 = search_aa_fresh(title, author)
        if md5:
            result["md5"] = md5
            log.info(f" Found MD5: {md5}")

    if not md5:
        result["notes"] = "Not found on AA or Z-Library"
        return result

    # 3. Try IPFS with real CIDs from AA detail page
    log.info(" Fetching IPFS CIDs from AA...")
    cids = get_ipfs_cids(md5)
    if cids:
        log.info(f" Found {len(cids)} IPFS CID(s), trying gateways...")
        if try_ipfs_download(cids, dest):
            result["status"] = "DOWNLOADED (IPFS)"
            result["file"] = str(dest)
            return result

    # 4. Try all mirrors with MD5
    log.info(f" Trying mirrors with MD5 {md5}...")
    if try_mirrors(md5, dest):
        result["status"] = "DOWNLOADED (mirror)"
        result["file"] = str(dest)
        return result

    result["status"] = "MD5 ONLY"
    result["notes"] = f"MD5 confirmed, all mirrors failed: {md5}"
    return result
def _md_cell(value):
    """Render *value* as one Markdown table cell (pipes escaped, single line)."""
    return str(value).replace("|", "\\|").replace("\n", " ")


def _md_section(heading, columns, rows):
    """Return the lines of a ``## heading`` section containing one table."""
    lines = [
        f"## {heading}", "",
        "| " + " | ".join(columns) + " |",
        "|" + "|".join("-" * (len(col) + 2) for col in columns) + "|",
    ]
    for row in rows:
        lines.append("| " + " | ".join(_md_cell(cell) for cell in row) + " |")
    lines.append("")
    return lines


def write_report(results):
    """Write the pass 2 Markdown report to REPORT_OUT.

    Results are grouped by status into up to four sections (downloaded,
    already in library, MD5 known but all mirrors failed, not found); empty
    sections are omitted, and results with any other status only count toward
    the "Searched" total. Cell values have ``|`` escaped so a title or note
    cannot break the table, and the file is written as UTF-8.

    Args:
        results: List of result dicts with the keys ``title``, ``author``,
            ``status``, ``md5``, ``file`` and ``notes``.
    """
    downloaded = [r for r in results if "DOWNLOADED" in r["status"]]
    md5_only = [r for r in results if r["status"] == "MD5 ONLY"]
    not_found = [r for r in results if r["status"] == "NOT FOUND"]
    existing = [r for r in results if r["status"] == "ALREADY EXISTS"]

    lines = [
        "# AA Acquisition Report -- Pass 2",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"**Searched:** {len(results)} | **Downloaded:** {len(downloaded)} | "
        f"**MD5 only:** {len(md5_only)} | **Not found:** {len(not_found)}",
        "",
    ]
    if downloaded:
        lines += _md_section(
            "Downloaded", ("Title", "Author", "Via", "File"),
            [(r["title"], r["author"], r["status"], f"`{Path(r['file']).name}`")
             for r in downloaded],
        )
    if existing:
        lines += _md_section(
            "Already in Library", ("Title", "Author"),
            [(r["title"], r["author"]) for r in existing],
        )
    if md5_only:
        lines += _md_section(
            "MD5 Known -- All Mirrors Failed", ("Title", "Author", "MD5"),
            [(r["title"], r["author"], f"`{r['md5']}`") for r in md5_only],
        )
    if not_found:
        lines += _md_section(
            "Not Found Anywhere", ("Title", "Author", "Notes"),
            [(r["title"], r["author"], r["notes"]) for r in not_found],
        )

    REPORT_OUT.parent.mkdir(parents=True, exist_ok=True)
    REPORT_OUT.write_text("\n".join(lines), encoding="utf-8")
    log.info(f"Report written to {REPORT_OUT}")
def main():
    """Run pass 2 over every entry of PASS1_FAILURES.

    Command line: ``--dry-run`` skips all network activity and does not read
    from or write to the checkpoint for skipping/saving purposes.

    For each book the checkpoint is consulted first (a book recorded there is
    not retried), otherwise process_book() is called and the checkpoint is
    saved right away so an interrupted run can resume. A report is written
    and a per-status summary printed at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # Load any MD5s captured in pass 1; the hard-coded hint wins when present.
    md5_map = load_md5s_from_report()
    targets = [
        (title, author, md5_hint or md5_map.get(title.lower(), ""), subdir)
        for title, author, md5_hint, subdir in PASS1_FAILURES
    ]

    # Load checkpoint
    completed = load_checkpoint()
    if completed:
        log.info(f"Resuming: {len(completed)} books already processed in previous run")
    log.info(f"Pass 2: {len(targets)} books | dry_run={args.dry_run}")

    total = len(targets)
    results = []
    for i, (title, author, md5, subdir) in enumerate(targets, 1):
        # Check checkpoint — skip already-processed books
        if title in completed and not args.dry_run:
            result = completed[title]
            results.append(result)
            log.info(f"[{i}/{len(targets)}] {title} — SKIPPED (checkpoint: {result['status']})")
            continue

        log.info(f"[{i}/{len(targets)}] {title} -- {author}")
        result = process_book(title, author, md5, subdir, args.dry_run)
        results.append(result)
        log.info(f" -> {result['status']}")

        if args.dry_run:
            continue
        # Save checkpoint after each book (not in dry-run)
        completed[title] = result
        save_checkpoint(completed)
        # The politeness delay is only needed between books that actually hit
        # remote hosts: skip it when the file was already on disk and after
        # the final book.
        if result["status"] != "ALREADY EXISTS" and i < total:
            time.sleep(random.uniform(6, 12))

    write_report(results)

    print("\n-- Pass 2 Summary ----------------------------------------")
    for status in ["DOWNLOADED (Z-Library)", "DOWNLOADED (IPFS)", "DOWNLOADED (mirror)",
                   "MD5 ONLY", "NOT FOUND", "ALREADY EXISTS", "DRY RUN"]:
        count = sum(1 for r in results if r["status"] == status)
        if count:
            print(f" {status:<35} {count:>3}")
    print(f" Report: {REPORT_OUT}")


if __name__ == "__main__":
    main()