#!/usr/bin/env python3
"""
Polite one-layer-deep scraper:
fetch the base URL, follow every link it contains, extract <div id="msgBody">
sections, convert them to Markdown, and save each one to its own file.
Randomized delays and request timeouts keep the crawl polite and reduce the
risk of being blocked or rate-limited.
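
Usage (the script name is a placeholder for whatever this file is saved as):
    python <this_file>.py START_URL [--out OUTPUT_DIR]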
"""

import os
import re
import time
import random
import requests
from urllib.parse import urljoin, urldefrag
from bs4 import BeautifulSoup
from markdownify import markdownify as md_convert

# ───────────────────────────────────────────────

# CONFIGURABLE SETTINGS
REQUEST_TIMEOUT = 15       # seconds per request before giving up
DELAY_BETWEEN_REQUESTS = (10, 20)  # range of delay (seconds) between requests
MAX_RETRIES = 2            # total attempts per URL (includes the first try)

# ───────────────────────────────────────────────

def sanitize_filename(s, max_len=60):
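    """Collapse whitespace and strip unsafe characters to make a safe,
    length-capped filename stem; fall back to "post" if nothing survives."""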
    s = re.sub(r'\s+', '_', s.strip())
    s = re.sub(r'[^A-Za-z0-9_\-\.]', '', s)
    return s[:max_len] or "post"

def polite_fetch(url):
    """Fetch a URL with retry and delay to avoid rate limiting."""
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            print(f"[+] Fetching ({attempt}/{MAX_RETRIES}): {url}")
            r = requests.get(url, timeout=REQUEST_TIMEOUT, headers={
                "User-Agent": "Mozilla/5.0 (compatible; polite-scraper/1.0)"
            })
            r.raise_for_status()
            # Randomized delay before returning
            delay = random.uniform(*DELAY_BETWEEN_REQUESTS)
            print(f"    Sleeping {delay:.1f}s to be polite...")
            time.sleep(delay)
            return r.text
        except requests.RequestException as e:
            print(f"[!] Error fetching {url}: {e}")
            if attempt < MAX_RETRIES:
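                # Simple linear backoff: wait a little longer each failure.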
                wait = 5 + attempt * 2
                print(f"    Retrying after {wait}s...")
                time.sleep(wait)
            else:
                print("    Skipping after max retries.")
                return None

def html_to_markdown(html):
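    """Convert an HTML fragment to Markdown with ATX-style (#) headings."""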
    return md_convert(html, heading_style="ATX")

def extract_links(base_url, html):
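    """Return the set of absolute, fragment-free http(s) links on a page."""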
    soup = BeautifulSoup(html, "html.parser")
    links = set()
    for a in soup.find_all("a", href=True):
        abs_url = urljoin(base_url, a["href"])
        abs_url, _ = urldefrag(abs_url)
        if abs_url.startswith("http"):
            links.add(abs_url)
    return links

def extract_msgbodies(html):
    """Return all <div id="msgBody"> elements on a page.

    Legacy forum pages often reuse the same id for every message, so
    find_all can legitimately return more than one match.
    """
    soup = BeautifulSoup(html, "html.parser")
    return soup.find_all("div", id="msgBody")

def save_markdown_file(path, content):
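    """Write Markdown text to `path`, encoded as UTF-8."""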
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)

def convert_divs_to_markdown(url, output_dir, used_names, html=None):
    """Extract msgBody divs from one page and save each as a Markdown file.

    Pass `html` to reuse an already-fetched page instead of re-requesting it.
    """
    if html is None:
        html = polite_fetch(url)
    if not html:
        return
    msg_bodies = extract_msgbodies(html)
    if not msg_bodies:
        print("    (no msgBody divs found)")
        return

    for div in msg_bodies:
        # Derive a filename from the first few words of the message text.
        text = div.get_text(separator=" ", strip=True)
        base_name = sanitize_filename(text[:20])
        # De-duplicate: repeated names get a numeric suffix so later
        # messages don't overwrite earlier ones.
        count = used_names.get(base_name, 0) + 1
        used_names[base_name] = count
        if count > 1:
            base_name = f"{base_name}_{count}"
        filename = os.path.join(output_dir, f"{base_name}.md")

        markdown = html_to_markdown(str(div))
        save_markdown_file(filename, markdown)
        print(f"    Saved: {filename}")

def crawl_one_layer(base_url, output_dir="msgBodies"):
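    """Fetch the base page, then every page it links to (one layer deep),
    saving any msgBody divs found along the way."""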
    os.makedirs(output_dir, exist_ok=True)
    used_names = {}

    print(f"[*] Starting polite crawl: {base_url}")
    base_html = polite_fetch(base_url)
    if not base_html:
        print("[!] Could not fetch base page.")
        return

    links = extract_links(base_url, base_html)
    print(f"[*] Found {len(links)} links. Processing (1 layer deep)...")

    try:
        # Process the already-fetched base page first (no second request),
        # then visit each discovered link.
        convert_divs_to_markdown(base_url, output_dir, used_names, html=base_html)
        for link in links - {base_url}:
            print(f"[→] Processing {link}")
            convert_divs_to_markdown(link, output_dir, used_names)
    except KeyboardInterrupt:
        print("\n[!] Crawl interrupted by user.")

    print(f"[✓] Done. Markdown files saved in '{output_dir}'")

# ───────────────────────────────────────────────

if __name__ == "__main__":
    import argparse

    p = argparse.ArgumentParser(description="Polite 1-layer Wayback-safe scraper for <div id='msgBody'>.")
    p.add_argument("url", help="Starting URL to crawl")
    p.add_argument("--out", "-o", default="msgBodies", help="Output directory")
    args = p.parse_args()

    crawl_one_layer(args.url, args.out)
