#!/usr/bin/env python3
"""
Fetch a webpage, extract each <article>, convert to Markdown (preserving links & styling),
and save each as a separate .md file named from the first 20 characters of the article.
Requires: requests, beautifulsoup4, and one of markdownify or html2text.
"""

import os
import re
import hashlib
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

# Try to import HTML -> Markdown converters. Prefer markdownify, fall back to html2text.
# Result: _CONVERTER is "markdownify", "html2text", or None (nothing installed);
# html_to_markdown() dispatches on this value at call time.
try:
    from markdownify import markdownify as md_convert
    _CONVERTER = "markdownify"
except Exception:
    try:
        import html2text
        _h2t = html2text.HTML2Text()
        # Configure html2text: keep links inline, don't wrap lines too early
        _h2t.ignore_images = False
        _h2t.body_width = 0
        _CONVERTER = "html2text"
    except Exception:
        # Neither converter importable; ensure_converter_available() reports this
        # with install instructions the first time a conversion is attempted.
        _CONVERTER = None


def ensure_converter_available():
    """Raise RuntimeError unless an HTML->Markdown converter was imported."""
    if not _CONVERTER:
        raise RuntimeError(
            "No HTML->Markdown converter found. Install one of:\n"
            "  pip install markdownify\n  or\n"
            "  pip install html2text\n"
        )


def sanitize_filename(s, max_len=60):
    """Turn an arbitrary text snippet into a safe filename stem.

    Runs of whitespace become single underscores, any character outside
    [A-Za-z0-9_.-] is dropped, and the result is capped at *max_len*
    characters. Returns "article" when nothing survives sanitization.
    """
    collapsed = re.sub(r'\s+', '_', s.strip())
    cleaned = re.sub(r'[^A-Za-z0-9_\-\.]', '', collapsed)
    return cleaned[:max_len] or "article"


def html_to_markdown(html):
    """
    Render an HTML fragment as Markdown using whichever converter is
    installed (markdownify preferred, html2text as fallback).
    Raises RuntimeError via ensure_converter_available() if neither exists.
    """
    ensure_converter_available()
    if _CONVERTER == "html2text":
        return _h2t.handle(html)
    if _CONVERTER == "markdownify":
        # ATX heading style produces "# Title" rather than underlined headings;
        # links, images and emphasis are preserved by default.
        return md_convert(html, heading_style="ATX")
    # Unreachable in practice: ensure_converter_available() already raised.
    return BeautifulSoup(html, "html.parser").get_text("\n", strip=True)


def make_absolute_urls(soup, base_url):
    """
    Rewrite relative URLs in the soup to absolute ones, in place, using
    *base_url* as the base: <a href>, every src-bearing tag (<img>,
    <source>, <script>, ...), and <form action>.
    """
    # (tag filter, attribute) pairs; True matches any tag name in bs4.
    targets = (
        ("a", "href"),
        (True, "src"),
        ("form", "action"),
    )
    for name_filter, attr in targets:
        for tag in soup.find_all(name_filter, **{attr: True}):
            tag[attr] = urljoin(base_url, tag[attr])


def unique_filename(base, used):
    """
    Return a unique "<stem>.md" filename for *base*.

    *used* maps each handed-out stem to a counter. The first request for a
    stem yields "<base>.md"; later requests yield "<base>_2.md",
    "<base>_3.md", and so on. Generated suffixed stems are also recorded
    in *used*, so a later article whose own snippet happens to equal an
    already-generated stem (e.g. literally "intro_2") cannot silently
    collide with an existing file path.
    """
    if base not in used:
        used[base] = 1
        return f"{base}.md"
    # Collision: advance the counter until the suffixed stem is unused.
    used[base] += 1
    candidate = f"{base}_{used[base]}"
    while candidate in used:
        used[base] += 1
        candidate = f"{base}_{used[base]}"
    used[candidate] = 1
    return f"{candidate}.md"


def save_markdown_file(path, content):
    """Write *content* to *path* as UTF-8 text, overwriting any existing file."""
    with open(path, mode="w", encoding="utf-8") as handle:
        handle.write(content)


def optional_download_image(img_url, images_dir):
    """
    Download the image at *img_url* into *images_dir* and return a relative
    local path ("<images_dir_basename>/img_<hash><ext>") suitable for
    rewriting an <img src>. On any failure the original *img_url* is
    returned unchanged, so the markdown keeps referencing the remote image
    (deliberate best-effort behavior).
    """
    try:
        os.makedirs(images_dir, exist_ok=True)
        # Context manager closes the streamed connection even on error;
        # a bare requests.get(stream=True) otherwise leaks the connection.
        with requests.get(img_url, stream=True, timeout=15) as resp:
            resp.raise_for_status()
            # Name the file from a short URL hash + original extension if present.
            url_hash = hashlib.sha1(img_url.encode("utf-8")).hexdigest()[:10]
            ext_match = re.search(r'\.([a-zA-Z0-9]{1,6})(?:$|\?)', img_url)
            ext = f".{ext_match.group(1)}" if ext_match else ""
            fname = f"img_{url_hash}{ext}"
            filepath = os.path.join(images_dir, fname)
            with open(filepath, "wb") as out:
                for chunk in resp.iter_content(8192):
                    out.write(chunk)
        return os.path.join(os.path.basename(images_dir), fname)
    except Exception:
        return img_url  # fallback to remote URL (best-effort by design)


def fetch_articles_to_markdown(url, output_dir="articles", download_images=False):
    """
    Fetch *url*, convert each <article> element to Markdown, and write one
    .md file per article into *output_dir*.

    Parameters:
    - url: page to fetch
    - output_dir: where to place markdown files (and optionally images/)
    - download_images: whether to download images referenced by articles
      and rewrite image URLs to local files

    Raises RuntimeError when no HTML->Markdown converter is installed, and
    requests.HTTPError when the page fetch fails.
    """
    ensure_converter_available()
    os.makedirs(output_dir, exist_ok=True)

    resp = requests.get(url, timeout=20)
    resp.raise_for_status()
    # Parse from raw bytes: BeautifulSoup sniffs the document's declared
    # charset itself, whereas resp.text can mis-decode pages whose headers
    # omit a charset (requests then assumes ISO-8859-1 for text/html).
    soup = BeautifulSoup(resp.content, "html.parser")

    articles = soup.find_all("article")
    if not articles:
        print("No <article> elements found on the page.")
        return

    used_names = {}
    for article in articles:
        # Rewrite relative hrefs/srcs so links keep working outside the page.
        make_absolute_urls(article, url)

        # Optionally mirror images locally and point <img> tags at the copies.
        if download_images:
            images_dir = os.path.join(output_dir, "images")
            for img in article.find_all("img", src=True):
                img["src"] = optional_download_image(img["src"], images_dir)

        # First 20 chars of visible text -> sanitized, collision-free filename.
        visible_text = article.get_text(separator=" ", strip=True)
        base_name = sanitize_filename(visible_text[:20])
        filename = unique_filename(base_name, used_names)
        filepath = os.path.join(output_dir, filename)

        # Convert the article's inner HTML (markup preserved) to markdown.
        inner_html = "".join(str(child) for child in article.contents)
        save_markdown_file(filepath, html_to_markdown(inner_html))
        print(f"Saved: {filepath}")

    print(f"Done — {len(articles)} article(s) processed into '{output_dir}'.")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Fetch webpage and convert <article> tags to Markdown files.")
    parser.add_argument("url", help="URL of the page to fetch")
    parser.add_argument("--out", "-o", default="articles", help="Output directory (default: articles)")
    parser.add_argument("--download-images", action="store_true", help="Download article images into <out>/images and rewrite links")
    cli = parser.parse_args()

    try:
        fetch_articles_to_markdown(cli.url, cli.out, cli.download_images)
    except Exception as exc:
        # Print a friendly one-liner, then re-raise so the traceback and
        # non-zero exit status are preserved.
        print("Error:", str(exc))
        raise