#!/usr/bin/env python3
"""
aa_download.py — Anna's Archive bulk downloader for RECON library acquisition.

For each target book:
  1. Searches annas-archive.org for the title + author
  2. Extracts the best PDF match (verified by author/page count)
  3. Gets the MD5 from the book page
  4. Attempts download from Libgen mirrors in order
  5. Verifies downloaded file is a valid PDF
  6. Writes full acquisition report

Usage:
  python3 /opt/recon/scripts/aa_download.py [--dry-run] [--limit N]

Report output: ~/projects/recon/aa_acquisition_report.md
"""

import json
import time
import random
import hashlib
import logging
import argparse
from pathlib import Path
from datetime import datetime

import requests
from bs4 import BeautifulSoup

REPORT_PATH = Path.home() / "projects/recon/aa_acquisition_report.md"
LOG_FILE    = Path("/opt/recon/logs/aa_download.log")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()]
)
log = logging.getLogger("aa_download")

SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Accept-Language": "en-US,en;q=0.9",
})

BASE_AA = "https://annas-archive.gl"

# Download attempt order — try fastest mirrors first
LIBGEN_MIRRORS = [
    "https://libgen.is/get.php?md5={md5}",
    "https://libgen.rs/get.php?md5={md5}",
    "https://libgen.st/get.php?md5={md5}",
    "https://libgen.li/ads.php?md5={md5}",
]

# ── Target book list ──────────────────────────────────────────────────────────
TARGETS = [
    # (title, author, dest_dir)

    # Medical — Herbalism
    ("Medical Herbalism",                          "David Hoffmann",             "Medical/Herbalism"),
    ("Making Plant Medicine",                      "Richo Cech",                 "Medical/Herbalism"),
    ("The Earthwise Herbal Volume 1",              "Matthew Wood",               "Medical/Herbalism"),
    ("The Earthwise Herbal Volume 2",              "Matthew Wood",               "Medical/Herbalism"),
    ("Herbal Antibiotics",                         "Stephen Buhner",             "Medical/Herbalism"),
    ("Herbal Antivirals",                          "Stephen Buhner",             "Medical/Herbalism"),
    ("The Herbal Medicine-Maker's Handbook",       "James Green",                "Medical/Herbalism"),
    ("Rosemary Gladstar's Medicinal Herbs",        "Rosemary Gladstar",          "Medical/Herbalism"),

    # Medical — Austere
    ("Wilderness Medicine",                        "Paul Auerbach",              "Medical/Austere"),
    ("Medicine for Mountaineering",                "James Wilkerson",            "Medical/Austere"),

    # Medical — Veterinary
    ("The Chicken Health Handbook",                "Gail Damerow",               "Medical/Veterinary"),
    ("Goat Husbandry",                             "David Mackenzie",            "Medical/Veterinary"),

    # Power Systems
    ("The Renewable Energy Handbook",              "William Kemp",               "Power"),
    ("Homebrew Wind Power",                        "Dan Bartmann",               "Power"),
    ("Wind Energy Basics",                         "Paul Gipe",                  "Power"),
    ("12-Volt Bible",                              "Brotherton",                 "Power"),
    ("Wiring a House",                             "Rex Cauldwell",              "Power"),

    # Navigation
    ("Wilderness Navigation",                      "Bob Burns",                  "Navigation"),
    ("Be Expert with Map and Compass",             "Bjorn Kjellstrom",           "Navigation"),
    ("Emergency Navigation",                       "David Burch",                "Navigation"),
    ("The Natural Navigator",                      "Tristan Gooley",             "Navigation"),
    ("The Essential Wilderness Navigator",         "David Seidman",              "Navigation"),

    # Water Systems
    ("Rainwater Harvesting for Drylands Volume 1", "Brad Lancaster",            "Water"),
    ("Rainwater Harvesting for Drylands Volume 2", "Brad Lancaster",            "Water"),
    ("Rainwater Harvesting for Drylands Volume 3", "Brad Lancaster",            "Water"),
    ("Water Storage",                              "Art Ludwig",                 "Water"),
    ("The Home Water Supply",                      "Stu Campbell",               "Water"),

    # Food Systems
    ("The Art of Fermentation",                    "Sandor Katz",                "Food"),
    ("Fermented Vegetables",                       "Kirsten Shockey",            "Food"),
    ("Mastering Artisan Cheesemaking",             "Gianaclis Caldwell",         "Food"),
    ("Home Cheese Making",                         "Ricki Carroll",              "Food"),
    ("The Art of Natural Cheesemaking",            "David Asher",                "Food"),

    # Permaculture
    ("Edible Forest Gardens Volume 1",             "Dave Jacke",                 "Permaculture"),
    ("Edible Forest Gardens Volume 2",             "Dave Jacke",                 "Permaculture"),
    ("Creating a Forest Garden",                   "Martin Crawford",            "Permaculture"),
    ("Sepp Holzer's Permaculture",                 "Sepp Holzer",                "Permaculture"),
    ("The Permaculture Handbook",                  "Peter Bane",                 "Permaculture"),
    ("The Market Gardener",                        "Jean-Martin Fortier",        "Permaculture"),

    # Scenario / Emergency
    ("SAS Survival Handbook",                      "John Wiseman",               "Scenario"),
    ("Pocket Ref",                                 "Thomas Glover",              "Scenario"),
    ("Deep Survival",                              "Laurence Gonzales",          "Scenario"),

    # Foundational Skills
    ("Back to Basics",                             "Reader's Digest",            "Skills"),
    ("A Pattern Language",                         "Christopher Alexander",      "Skills"),
]

BASE_LIB = Path("/mnt/library/Acquired")


def search_aa(title, author):
    """Search Anna's Archive and return list of candidate result dicts."""
    query = f"{title} {author}"
    url = f"{BASE_AA}/search"
    params = {"q": query, "ext": "pdf", "lang": "en"}
    try:
        r = SESSION.get(url, params=params, timeout=20)
        r.raise_for_status()
    except Exception as e:
        log.warning(f"Search failed for '{title}': {e}")
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    results = []

    seen_md5 = set()
    for item in soup.select("a[href^='/md5/']"):
        href = item.get("href", "")
        md5 = href.split("/md5/")[-1].split("/")[0].split("?")[0].strip()
        if not md5 or len(md5) != 32:
            continue
        text = item.get_text(" ", strip=True)
        if not text or md5 in seen_md5:
            continue
        seen_md5.add(md5)
        results.append({"md5": md5, "text": text, "href": href})
        if len(results) >= 5:
            break

    return results


def get_book_details(md5):
    """Fetch the book detail page and extract useful metadata."""
    url = f"{BASE_AA}/md5/{md5}"
    try:
        r = SESSION.get(url, timeout=20)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        text = soup.get_text(" ", strip=True)
        # Extract page count if visible
        pages = None
        for word in text.split():
            if word.isdigit() and 50 < int(word) < 5000:
                pages = int(word)
                break
        return {"pages": pages, "text": text[:500]}
    except Exception as e:
        log.warning(f"Detail fetch failed for md5={md5}: {e}")
        return {}


def try_download(md5, dest_path):
    """Try each libgen mirror until one works. Returns True on success."""
    for mirror_tpl in LIBGEN_MIRRORS:
        url = mirror_tpl.format(md5=md5)
        try:
            r = SESSION.get(url, timeout=60, stream=True, allow_redirects=True)
            content_type = r.headers.get("content-type", "")
            if r.status_code != 200:
                continue
            # Some mirrors return an HTML ads page before the real file
            if "text/html" in content_type:
                # Parse redirect link from ads page
                soup = BeautifulSoup(r.text, "html.parser")
                dl_link = soup.select_one("a[href*='.pdf']")
                if not dl_link:
                    dl_link = soup.select_one("a[href*='get.php']")
                if not dl_link:
                    continue
                actual_url = dl_link["href"]
                if not actual_url.startswith("http"):
                    actual_url = f"https://libgen.is{actual_url}"
                r = SESSION.get(actual_url, timeout=120, stream=True)
                if r.status_code != 200:
                    continue

            # Stream to disk
            dest_path.parent.mkdir(parents=True, exist_ok=True)
            with open(dest_path, "wb") as f:
                for chunk in r.iter_content(8192):
                    f.write(chunk)

            # Verify it's a real PDF
            with open(dest_path, "rb") as f:
                header = f.read(4)
            if header == b"%PDF":
                size_mb = dest_path.stat().st_size / 1024 / 1024
                log.info(f"  [OK] {dest_path.name} ({size_mb:.1f}MB) via {url}")
                return True
            else:
                log.warning(f"  [BAD] Not a PDF from {url}")
                dest_path.unlink(missing_ok=True)

        except Exception as e:
            log.warning(f"  Mirror failed {url}: {e}")
            continue

    return False


def process_book(title, author, subdir, dry_run):
    """Full search + download pipeline for one book."""
    log.info(f"[SEARCH] '{title}' — {author}")
    result = {
        "title": title,
        "author": author,
        "status": "NOT FOUND",
        "md5": "",
        "pages": "",
        "file": "",
        "notes": "",
    }

    candidates = search_aa(title, author)
    if not candidates:
        result["notes"] = "No results from AA search"
        return result

    # Pick best candidate — prefer one whose text contains author name
    best = None
    for c in candidates:
        if author.split()[-1].lower() in c["text"].lower():
            best = c
            break
    if not best:
        best = candidates[0]  # take first result if no author match

    md5 = best["md5"]
    result["md5"] = md5

    details = get_book_details(md5)
    result["pages"] = details.get("pages", "")

    if dry_run:
        result["status"] = "DRY RUN — found"
        result["notes"] = f"MD5: {md5}"
        return result

    # Build destination path
    safe_title = "".join(c if c.isalnum() or c in " ._-" else "_" for c in title)[:60]
    safe_author = author.split()[-1]
    filename = f"{safe_title}_{safe_author}.pdf"
    dest = BASE_LIB / subdir / filename

    if dest.exists():
        result["status"] = "ALREADY EXISTS"
        result["file"] = str(dest)
        return result

    log.info(f"  MD5: {md5} — attempting download...")
    ok = try_download(md5, dest)

    if ok:
        result["status"] = "DOWNLOADED"
        result["file"] = str(dest)
    else:
        result["status"] = "MD5 ONLY"
        result["notes"] = f"All mirrors failed. MD5: {md5}"

    return result


def write_report(results):
    REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
    downloaded   = [r for r in results if r["status"] == "DOWNLOADED"]
    md5_only     = [r for r in results if r["status"] == "MD5 ONLY"]
    not_found    = [r for r in results if r["status"] == "NOT FOUND"]
    already_have = [r for r in results if r["status"] == "ALREADY EXISTS"]

    lines = [
        f"# Anna's Archive Acquisition Report",
        f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"**Total searched:** {len(results)}",
        f"",
        f"| Status | Count |",
        f"|--------|-------|",
        f"| Downloaded | {len(downloaded)} |",
        f"| MD5 only (mirrors failed) | {len(md5_only)} |",
        f"| Not found on AA | {len(not_found)} |",
        f"| Already in library | {len(already_have)} |",
        f"",
    ]

    if downloaded:
        lines += ["## Downloaded", ""]
        lines += ["| Title | Author | Pages | File |", "|-------|--------|-------|------|"]
        for r in downloaded:
            lines.append(f"| {r['title']} | {r['author']} | {r['pages']} | `{Path(r['file']).name}` |")
        lines.append("")

    if md5_only:
        lines += ["## Found on AA — Download Failed (use MD5 for manual retrieval)", ""]
        lines += ["| Title | Author | MD5 | Notes |", "|-------|--------|-----|-------|"]
        for r in md5_only:
            lines.append(f"| {r['title']} | {r['author']} | `{r['md5']}` | {r['notes']} |")
        lines.append("")

    if not_found:
        lines += ["## Not Found on Anna's Archive", ""]
        lines += ["| Title | Author | Notes |", "|-------|--------|-------|"]
        for r in not_found:
            lines.append(f"| {r['title']} | {r['author']} | {r['notes']} |")
        lines.append("")

    if already_have:
        lines += ["## Already in Library", ""]
        lines += ["| Title | Author |", "|-------|--------|"]
        for r in already_have:
            lines.append(f"| {r['title']} | {r['author']} |")
        lines.append("")

    REPORT_PATH.write_text("\n".join(lines))
    log.info(f"Report written to {REPORT_PATH}")


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()

    targets = TARGETS[:args.limit] if args.limit else TARGETS
    log.info(f"Starting AA acquisition: {len(targets)} books | dry_run={args.dry_run}")

    results = []
    for i, (title, author, subdir) in enumerate(targets, 1):
        log.info(f"[{i}/{len(targets)}]")
        result = process_book(title, author, subdir, args.dry_run)
        results.append(result)
        log.info(f"  -> {result['status']}")
        # Polite delay between requests
        time.sleep(random.uniform(8, 15))

    write_report(results)

    print(f"\n-- Summary -----------------------------------------------")
    for status in ["DOWNLOADED", "MD5 ONLY", "NOT FOUND", "ALREADY EXISTS", "DRY RUN — found"]:
        count = sum(1 for r in results if r["status"] == status)
        if count:
            print(f"  {status:<35} {count:>3}")
    print(f"  Report: {REPORT_PATH}")


if __name__ == "__main__":
    main()