#!/usr/bin/env python3
"""
aa_download_pass2.py — Second-pass downloader for books that failed in pass 1.

For every book in PASS1_FAILURES (MD5s may be back-filled from the pass 1
report) the script tries, in order:
  1. Z-Library search by title/author (separate catalog from Libgen)
  2. A fresh Anna's Archive search, only when no MD5 is known yet
  3. IPFS gateways, using the CIDs scraped from the AA detail page
     (a CID is not the MD5, so it has to be looked up first)
  4. Alternative Libgen / Z-Library mirrors addressed by MD5

Checkpoint: saves progress to /opt/recon/data/aa_pass2_checkpoint.json
so interrupted runs resume where they left off.

Usage:
    python3 /opt/recon/scripts/aa_download_pass2.py [--dry-run]
"""

import argparse
import hashlib
import json
import logging
import random
import re
import time
from datetime import datetime
from html import unescape
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

LOG_FILE = Path("/opt/recon/logs/aa_download_pass2.log")
REPORT_IN = Path.home() / "projects/recon/aa_acquisition_report.md"
REPORT_OUT = Path.home() / "projects/recon/aa_acquisition_report_pass2.md"
CHECKPOINT = Path("/opt/recon/data/aa_pass2_checkpoint.json")
BASE_LIB = Path("/mnt/library/Acquired")

# FileHandler raises if the directory is missing, so create it up front.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()],
)
log = logging.getLogger("aa_pass2")

SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Accept-Language": "en-US,en;q=0.9",
})

# First bytes of every PDF file; used to reject HTML error pages saved as ".pdf".
PDF_MAGIC = b"%PDF"

# Patterns hoisted to module level so they are compiled once.
_MD5_RE = re.compile(r"[0-9a-fA-F]{32}")
_GET_PHP_RE = re.compile(r'href="(get\.php\?md5=[^"]+)"')
_IPFS_CID_PATTERNS = (
    re.compile(r"ipfs_cid[:\s]+([A-Za-z0-9]{46,})"),
    re.compile(r"ipfs://([A-Za-z0-9]{46,})"),
)

# Title words too common to be evidence that a search hit is the right book.
_TITLE_STOPWORDS = frozenset({"a", "an", "and", "for", "in", "of", "on", "the", "to"})

# ── Mirrors to try in order ───────────────────────────────────────────────────
# Only MD5-addressable endpoints belong here. IPFS gateways are addressed by
# CID, not MD5, so they are handled by get_ipfs_cids() / try_ipfs_download().
MIRRORS = [
    # Libgen alternatives
    "https://libgen.li/ads.php?md5={md5}",
    "https://library.lol/main/{md5}",
    "https://libgen.rocks/get.php?md5={md5}",
    # Z-Library direct MD5 endpoint (sometimes works)
    "https://z-library.se/md5/{md5}",
]

# ── Books that failed in pass 1 — title, author, md5, subdir ─────────────────
PASS1_FAILURES = [
    # Medical/Herbalism
    ("The Earthwise Herbal Volume 1", "Matthew Wood", "fc8dc19f5a17f38849a3979830dc95c1", "Medical/Herbalism"),
    # NOTE(review): Volume 2 carries the same MD5 as Volume 1 — one of the two
    # is almost certainly wrong; confirm against the pass 1 report.
    ("The Earthwise Herbal Volume 2", "Matthew Wood", "fc8dc19f5a17f38849a3979830dc95c1", "Medical/Herbalism"),
    ("Herbal Antibiotics", "Stephen Buhner", "5839dab78edfdff0d7986fac62b814da", "Medical/Herbalism"),
    ("The Herbal Medicine-Maker's Handbook", "James Green", "27e8e8a3585705ed194029b69c7d61b1", "Medical/Herbalism"),
    ("Rosemary Gladstar's Medicinal Herbs", "Rosemary Gladstar", "9b1966f20a32ab4331bfece167be1dd0", "Medical/Herbalism"),
    # Medical/Austere
    ("Wilderness Medicine", "Paul Auerbach", "957818eaa4ec40527bb05902f9ef7c51", "Medical/Austere"),
    ("Medicine for Mountaineering", "James Wilkerson", "39cb07998f2034206f0c9472e44cb0b4", "Medical/Austere"),
    # Medical/Veterinary
    ("The Chicken Health Handbook", "Gail Damerow", "0ba42fbea034b9a08ec8e2f8d7606efe", "Medical/Veterinary"),
    # Power
    ("The Renewable Energy Handbook", "William Kemp", "475d89fa80aea6c45aa4b1b4b9c5e274", "Power"),
    ("Homebrew Wind Power", "Dan Bartmann", "0578696d5b1b6bceb3e5e3302c1a31aa", "Power"),
    ("Wind Energy Basics", "Paul Gipe", "ccbe9d22e0a5e32d61921d20d66a8e05", "Power"),
    ("12-Volt Bible", "Brotherton", "3f964fa6d730fdf2c3d3e231e87cf692", "Power"),
    ("Wiring a House", "Rex Cauldwell", "5efcb53450e9eb560210eee40678adcf", "Power"),
    # Navigation
    ("Emergency Navigation", "David Burch", "25e4def9e777b3fa9ca935134732ff9d", "Navigation"),
    # Water
    ("Water Storage", "Art Ludwig", "17c965ec15c6cf4f09b5377b599a5266", "Water"),
    ("The Home Water Supply", "Stu Campbell", "9b22677d2f8e8b39f7a6bf032187295b", "Water"),
    # Food
    ("Fermented Vegetables", "Kirsten Shockey", "74d3bde876b4c17be66c21fdfa85213e", "Food"),
    ("The Art of Natural Cheesemaking", "David Asher", "bc0e0829d701fea9beca912d39f8cc74", "Food"),
    # Permaculture
    ("Edible Forest Gardens Volume 1", "Dave Jacke", "6b069c3bb077fdd89d487a363c070fbb", "Permaculture"),
    ("Edible Forest Gardens Volume 2", "Dave Jacke", "699255bfde7f69285c132a94ec291bf4", "Permaculture"),
    ("Creating a Forest Garden", "Martin Crawford", "96d71d70dba31ae86e14845f913e557e", "Permaculture"),
    ("Sepp Holzer's Permaculture", "Sepp Holzer", "32be55a9fce3e31cacd6912069abb410", "Permaculture"),
    ("The Permaculture Handbook", "Peter Bane", "08cb4492739fda4d01b5a868a408e4a0", "Permaculture"),
    ("The Market Gardener", "Jean-Martin Fortier", "ac69f6c8c22305b42b539482dc761c19", "Permaculture"),
    # Scenario
    ("SAS Survival Handbook", "John Wiseman", "fa967fd5fcbeb3c9887e22f73e590c64", "Scenario"),
    ("Pocket Ref", "Thomas Glover", "8e4988ce513a4aa75e7e6c00ee36692b", "Scenario"),
    ("Deep Survival", "Laurence Gonzales", "9a907ab13b81ea597407fffdb8ea1b04", "Scenario"),
    # Skills
    ("A Pattern Language", "Christopher Alexander", "7f5cc06b5399b65a278c4005ccd8d476", "Skills"),
]


def load_checkpoint():
    """Load checkpoint: dict of {title: result_dict} for completed books.

    Returns an empty dict when the checkpoint file is missing, unreadable,
    or does not contain a JSON object; the problem is logged so a corrupt
    checkpoint does not silently restart the whole run.
    """
    if not CHECKPOINT.exists():
        return {}
    try:
        data = json.loads(CHECKPOINT.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:  # JSONDecodeError is a ValueError
        log.warning(f"Ignoring unreadable checkpoint {CHECKPOINT}: {e}")
        return {}
    if not isinstance(data, dict):
        log.warning(f"Ignoring checkpoint {CHECKPOINT}: expected a JSON object")
        return {}
    return data


def save_checkpoint(completed):
    """Save checkpoint after each book.

    Writes to a sibling ".tmp" file first and then renames it over the
    checkpoint, so an interrupted write never leaves a half-written file.
    """
    CHECKPOINT.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = Path(str(CHECKPOINT) + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(completed, f, indent=2)
    tmp_path.replace(CHECKPOINT)


def load_md5s_from_report():
    """Parse MD5 hashes from pass 1 report to pre-populate PASS1_FAILURES.

    Scans REPORT_IN for markdown table rows whose third cell is a
    backtick-quoted 32-character hex string and returns
    {lower-cased title: md5}. Returns {} when the report does not exist.
    """
    if not REPORT_IN.exists():
        return {}
    md5_map = {}
    for line in REPORT_IN.read_text(encoding="utf-8").splitlines():
        if "`" not in line or len(line) <= 30:
            continue
        # "| Title | Author | `md5` |" splits into ['', title, author, md5, ''].
        parts = line.split("|")
        if len(parts) < 4:
            continue
        md5_cell = parts[3].strip().strip("`")
        if _MD5_RE.fullmatch(md5_cell):
            md5_map[parts[1].strip().lower()] = md5_cell
    return md5_map


def _save_pdf_stream(response, dest_path):
    """Stream *response* to *dest_path*; return True only if it is a PDF.

    The body is written to a ".part" sibling and only renamed onto
    *dest_path* after the PDF magic bytes check passes. A failed, truncated,
    or non-PDF download therefore never leaves a file at *dest_path* that a
    later run would mistake for an already-acquired book. Exceptions raised
    while streaming propagate to the caller after the partial file is removed.
    """
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    part_path = dest_path.with_name(dest_path.name + ".part")
    try:
        with open(part_path, "wb") as f:
            for chunk in response.iter_content(8192):
                f.write(chunk)
        with open(part_path, "rb") as f:
            is_pdf = f.read(len(PDF_MAGIC)) == PDF_MAGIC
        if is_pdf:
            part_path.replace(dest_path)
        return is_pdf
    finally:
        # No-op after a successful rename; cleans up in every other case.
        part_path.unlink(missing_ok=True)


def _fetch_pdf(url, dest_path, timeout):
    """GET *url* as a stream and save it via _save_pdf_stream.

    Returns False on any non-200 status. The response is always closed.
    """
    with SESSION.get(url, timeout=timeout, stream=True) as resp:
        if resp.status_code != 200:
            return False
        return _save_pdf_stream(resp, dest_path)


def search_zlib(title, author):
    """Try Z-Library search endpoint.

    Returns the absolute URL of the first search hit whose link contains
    "/book/", or None when the search fails or yields nothing.
    """
    try:
        url = "https://z-library.se/s/"
        params = {"q": f"{title} {author}", "extension[]": "pdf"}
        r = SESSION.get(url, params=params, timeout=15)
        if r.status_code != 200:
            return None
        soup = BeautifulSoup(r.text, "html.parser")
        # Z-lib book links contain /book/
        for a in soup.select("a[href*='/book/']")[:3]:
            href = a.get("href", "")
            if href:
                return urljoin(r.url, href)
    except Exception as e:  # best-effort source: never abort the run
        log.debug(f"Zlib search failed: {e}")
    return None


def try_zlib_download(book_url, dest_path):
    """Download from Z-Library book page.

    Loads *book_url*, follows the first download link found on it and saves
    the result to *dest_path*. Returns True only when a PDF was saved.
    """
    try:
        page = SESSION.get(book_url, timeout=15)
        if page.status_code != 200:
            return False
        soup = BeautifulSoup(page.text, "html.parser")
        dl = soup.select_one(
            "a.addDownloadedBook, a[href*='/dl/'], a.btn-primary[href*='download']"
        )
        href = dl.get("href") if dl else None
        if not href:
            return False
        # urljoin handles absolute, root-relative and page-relative hrefs.
        return _fetch_pdf(urljoin(page.url, href), dest_path, timeout=120)
    except Exception as e:  # best-effort source: never abort the run
        log.debug(f"Zlib download failed: {e}")
    return False


def _find_download_link(page_html, page_url):
    """Return the absolute download URL found in a mirror landing page, or None.

    Prefers a "get.php?md5=..." link (the libgen ads page carries the keyed
    link in that form) and otherwise falls back to the first anchor that
    looks like a file link. Relative links are resolved against *page_url*,
    i.e. the mirror that actually served the page.
    """
    match = _GET_PHP_RE.search(page_html)
    if match:
        # The regex sees raw HTML, so "&amp;" must be decoded before use.
        return urljoin(page_url, unescape(match.group(1)))
    soup = BeautifulSoup(page_html, "html.parser")
    anchor = (
        soup.select_one("a[href*='.pdf']")
        or soup.select_one("a[href*='get.php']")
        or soup.select_one("a[href*='/get/']")
    )
    if not anchor:
        return None
    return urljoin(page_url, anchor["href"])


def _download_from_mirror(url, dest_path):
    """Fetch one mirror URL; follow its landing page if it returns HTML."""
    with SESSION.get(url, timeout=20, stream=True, allow_redirects=True) as resp:
        if resp.status_code != 200:
            return False
        if "html" not in resp.headers.get("content-type", "").lower():
            # The mirror answered with the file itself.
            return _save_pdf_stream(resp, dest_path)
        target = _find_download_link(resp.text, resp.url)
    if not target:
        return False
    return _fetch_pdf(target, dest_path, timeout=60)


def try_mirrors(md5, dest_path):
    """Try all mirrors with the MD5.

    Returns True as soon as one mirror yields a PDF saved at *dest_path*,
    False when every entry in MIRRORS has been exhausted.
    """
    for tpl in MIRRORS:
        url = tpl.format(md5=md5)
        try:
            if _download_from_mirror(url, dest_path):
                size_mb = dest_path.stat().st_size / 1024 / 1024
                log.info(f" [OK] {size_mb:.1f}MB via {url}")
                return True
        except Exception as e:  # one broken mirror must not stop the others
            log.debug(f"Mirror {url} failed: {e}")
        time.sleep(2)
    return False


def get_ipfs_cids(md5):
    """Fetch IPFS CIDs from AA book detail page.

    Returns the CIDs found on the page in first-seen order without
    duplicates; an empty list when the page cannot be fetched.
    """
    cids = []
    try:
        r = SESSION.get(f"https://annas-archive.gl/md5/{md5}", timeout=20)
        if r.status_code == 200:
            # Both the "ipfs_cid: ..." field and ipfs:// links are checked.
            for pattern in _IPFS_CID_PATTERNS:
                for m in pattern.finditer(r.text):
                    if m.group(1) not in cids:
                        cids.append(m.group(1))
    except Exception as e:  # best-effort source: never abort the run
        log.debug(f"IPFS CID fetch failed: {e}")
    return cids


def try_ipfs_download(cids, dest_path):
    """Try downloading via IPFS public gateways.

    Tries at most the first three CIDs against each gateway and returns
    True as soon as a PDF has been saved at *dest_path*.
    """
    gateways = [
        "https://cloudflare-ipfs.com/ipfs/{}",
        "https://dweb.link/ipfs/{}",
    ]
    for cid in cids[:3]:  # limit to first 3 CIDs
        for gw_tpl in gateways:
            url = gw_tpl.format(cid)
            try:
                if _fetch_pdf(url, dest_path, timeout=15):
                    size_mb = dest_path.stat().st_size / 1024 / 1024
                    log.info(f" [OK] {size_mb:.1f}MB via IPFS {url[:60]}...")
                    return True
            except Exception as e:  # one dead gateway must not stop the others
                log.debug(f"IPFS {url} failed: {e}")
            time.sleep(1)
    return False


def _looks_like_match(text, title, author):
    """Return True when a search-result label plausibly names the wanted book.

    A hit is accepted when it mentions the author's surname, or when it
    contains every significant word of the title. Stopwords are ignored so a
    leading "The" or "A" cannot match arbitrary results.
    """
    haystack = text.lower()
    author_words = author.split()
    if author_words and author_words[-1].lower() in haystack:
        return True
    keywords = [w for w in title.lower().split() if w not in _TITLE_STOPWORDS]
    return bool(keywords) and all(w in haystack for w in keywords)


def search_aa_fresh(title, author):
    """Fresh AA search on .gl domain for books that weren't found before.

    Tries each known AA domain in turn and returns the MD5 of the first
    plausible hit, or None when nothing matches.
    """
    for domain in ["annas-archive.gl", "annas-archive.se", "annas-archive.org"]:
        try:
            url = f"https://{domain}/search"
            params = {"q": f"{title} {author}", "ext": "pdf", "lang": "en"}
            r = SESSION.get(url, params=params, timeout=15)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            for a in soup.select("a[href^='/md5/']"):
                text = a.get_text(" ", strip=True)
                if not text:
                    continue
                md5 = a["href"].split("/md5/")[-1].split("/")[0].strip()
                if _MD5_RE.fullmatch(md5) and _looks_like_match(text, title, author):
                    return md5
        except Exception as e:  # fall through to the next domain
            log.debug(f"AA search on {domain} failed: {e}")
    return None


def _safe_filename_part(text):
    """Replace every character that is not alphanumeric or one of ' ._-' with '_'."""
    return "".join(c if c.isalnum() or c in " ._-" else "_" for c in text)


def process_book(title, author, md5_hint, subdir, dry_run):
    """Try every source for one book and return its result dict.

    The result has the keys "title", "author", "status", "md5", "file" and
    "notes". Status is one of "ALREADY EXISTS", "DRY RUN",
    "DOWNLOADED (Z-Library)", "DOWNLOADED (IPFS)", "DOWNLOADED (mirror)",
    "MD5 ONLY" or "NOT FOUND".
    """
    result = {
        "title": title,
        "author": author,
        "status": "NOT FOUND",
        "md5": md5_hint,
        "file": "",
        "notes": "",
    }

    safe_title = _safe_filename_part(title)[:60]
    author_words = author.split()
    # Surname only; guard against an empty author instead of raising.
    safe_author = _safe_filename_part(author_words[-1]) if author_words else "unknown"
    dest = BASE_LIB / subdir / f"{safe_title}_{safe_author}.pdf"

    if dest.exists():
        result["status"] = "ALREADY EXISTS"
        result["file"] = str(dest)
        return result

    if dry_run:
        result["status"] = "DRY RUN"
        return result

    # 1. Try Z-Library first (different catalog)
    log.info(" Trying Z-Library...")
    zlib_url = search_zlib(title, author)
    if zlib_url and try_zlib_download(zlib_url, dest):
        result["status"] = "DOWNLOADED (Z-Library)"
        result["file"] = str(dest)
        return result

    # 2. If no MD5 from pass 1, do a fresh AA search
    md5 = md5_hint
    if not md5:
        log.info(" Searching AA for fresh MD5...")
        md5 = search_aa_fresh(title, author)
        if md5:
            result["md5"] = md5
            log.info(f" Found MD5: {md5}")

    if not md5:
        result["notes"] = "Not found on AA or Z-Library"
        return result

    # 3. Try IPFS with real CIDs from AA detail page
    log.info(" Fetching IPFS CIDs from AA...")
    cids = get_ipfs_cids(md5)
    if cids:
        log.info(f" Found {len(cids)} IPFS CID(s), trying gateways...")
        if try_ipfs_download(cids, dest):
            result["status"] = "DOWNLOADED (IPFS)"
            result["file"] = str(dest)
            return result

    # 4. Try all mirrors with MD5
    log.info(f" Trying mirrors with MD5 {md5}...")
    if try_mirrors(md5, dest):
        result["status"] = "DOWNLOADED (mirror)"
        result["file"] = str(dest)
        return result

    result["status"] = "MD5 ONLY"
    result["notes"] = f"MD5 confirmed, all mirrors failed: {md5}"
    return result


def write_report(results):
    """Write the pass 2 markdown report to REPORT_OUT.

    Results are grouped by status into "Downloaded", "Already in Library",
    "MD5 Known -- All Mirrors Failed" and "Not Found Anywhere" tables;
    empty groups are omitted.
    """
    downloaded = [r for r in results if "DOWNLOADED" in r["status"]]
    md5_only = [r for r in results if r["status"] == "MD5 ONLY"]
    not_found = [r for r in results if r["status"] == "NOT FOUND"]
    existing = [r for r in results if r["status"] == "ALREADY EXISTS"]

    lines = [
        "# AA Acquisition Report -- Pass 2",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"**Searched:** {len(results)} | **Downloaded:** {len(downloaded)} | "
        f"**MD5 only:** {len(md5_only)} | **Not found:** {len(not_found)}",
        "",
    ]

    if downloaded:
        lines += ["## Downloaded", "",
                  "| Title | Author | Via | File |",
                  "|-------|--------|-----|------|"]
        for r in downloaded:
            lines.append(
                f"| {r['title']} | {r['author']} | {r['status']} | `{Path(r['file']).name}` |"
            )
        lines.append("")

    if existing:
        lines += ["## Already in Library", "",
                  "| Title | Author |",
                  "|-------|--------|"]
        for r in existing:
            lines.append(f"| {r['title']} | {r['author']} |")
        lines.append("")

    if md5_only:
        lines += ["## MD5 Known -- All Mirrors Failed", "",
                  "| Title | Author | MD5 |",
                  "|-------|--------|-----|"]
        for r in md5_only:
            lines.append(f"| {r['title']} | {r['author']} | `{r['md5']}` |")
        lines.append("")

    if not_found:
        lines += ["## Not Found Anywhere", "",
                  "| Title | Author | Notes |",
                  "|-------|--------|-------|"]
        for r in not_found:
            lines.append(f"| {r['title']} | {r['author']} | {r['notes']} |")
        lines.append("")

    REPORT_OUT.parent.mkdir(parents=True, exist_ok=True)
    REPORT_OUT.write_text("\n".join(lines), encoding="utf-8")
    log.info(f"Report written to {REPORT_OUT}")


def main():
    """Run pass 2 over PASS1_FAILURES, checkpointing after every book."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # Load any MD5s captured in pass 1
    md5_map = load_md5s_from_report()
    targets = [
        (title, author, md5_hint or md5_map.get(title.lower(), ""), subdir)
        for title, author, md5_hint, subdir in PASS1_FAILURES
    ]

    # Load checkpoint
    completed = load_checkpoint()
    if completed:
        log.info(f"Resuming: {len(completed)} books already processed in previous run")

    log.info(f"Pass 2: {len(targets)} books | dry_run={args.dry_run}")

    results = []
    for i, (title, author, md5, subdir) in enumerate(targets, 1):
        # Check checkpoint — skip already-processed books
        if title in completed and not args.dry_run:
            result = completed[title]
            results.append(result)
            log.info(f"[{i}/{len(targets)}] {title} — SKIPPED (checkpoint: {result['status']})")
            continue

        log.info(f"[{i}/{len(targets)}] {title} -- {author}")
        result = process_book(title, author, md5, subdir, args.dry_run)
        results.append(result)
        log.info(f" -> {result['status']}")

        # Save checkpoint after each book (not in dry-run)
        if not args.dry_run:
            completed[title] = result
            save_checkpoint(completed)
            # Rate-limit only after real network activity; a book that was
            # already on disk triggered no requests.
            if result["status"] != "ALREADY EXISTS":
                time.sleep(random.uniform(6, 12))

    # NOTE(review): the report is also rewritten in dry-run mode, replacing
    # any report from a real run — confirm this is intended.
    write_report(results)

    print("\n-- Pass 2 Summary ----------------------------------------")
    for status in ["DOWNLOADED (Z-Library)", "DOWNLOADED (IPFS)", "DOWNLOADED (mirror)",
                   "MD5 ONLY", "NOT FOUND", "ALREADY EXISTS", "DRY RUN"]:
        count = sum(1 for r in results if r["status"] == status)
        if count:
            print(f" {status:<35} {count:>3}")
    print(f" Report: {REPORT_OUT}")


if __name__ == "__main__":
    main()