#!/usr/bin/env python3
"""
SEC Stats — D1Baseball Rankings Scraper
Fetches the current D1Baseball Top 25 and writes rankings.json
to the stats JSON output folder so the frontend can display them.

Usage:
  python3 scrape_rankings.py                          # write to json/baseball/
  python3 scrape_rankings.py --out json/baseball      # explicit output path
  python3 scrape_rankings.py --dry-run                # print rankings, don't write

Designed to be run weekly (Monday morning) after D1Baseball updates their poll.
Add to a cron job or run manually after the weekly poll drops.
"""

import json
import re
import sys
import os
import argparse
import urllib.request
from datetime import datetime
from pathlib import Path

OUT_DIR = Path("json/baseball")

# D1Baseball rankings page — the main rankings listing
# Rankings are embedded in the page HTML as a structured table
RANKINGS_URL = "https://d1baseball.com/rankings/"

# NCAA.com also hosts the D1Baseball poll in a table format
NCAA_URL = "https://www.ncaa.com/rankings/baseball/d1/d1baseballcom-top-25"

# Map D1Baseball team names → our StatCrew team names
# Only needed for teams whose names differ between sources
NAME_MAP = {
    # Currently an identity map: every SEC school's D1Baseball name already
    # matches our StatCrew name. Kept as a mapping so a future mismatch on
    # either side is a one-line fix; its values also serve as the SEC roster
    # used by print_rankings() to flag SEC teams in output.
    "LSU":              "LSU",
    "Alabama":          "Alabama",
    "Arkansas":         "Arkansas",
    "Auburn":           "Auburn",
    "Florida":          "Florida",
    "Georgia":          "Georgia",
    "Kentucky":         "Kentucky",
    "Ole Miss":         "Ole Miss",
    "Mississippi State":"Mississippi State",
    "Missouri":         "Missouri",
    "Oklahoma":         "Oklahoma",
    "South Carolina":   "South Carolina",
    "Tennessee":        "Tennessee",
    "Texas":            "Texas",
    "Texas A&M":        "Texas A&M",
    "Vanderbilt":       "Vanderbilt",
}


def fetch_url(url, timeout=15):
    """Download *url* and return the body decoded as UTF-8 (bad bytes replaced)."""
    # Present a desktop-browser UA: both sources serve full HTML to browsers.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml",
    }
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode("utf-8", errors="replace")


def parse_ncaa_rankings(html):
    """
    Parse rankings from NCAA.com rankings page.
    The page renders a table: Rank | Team | Record | Points
    """
    # A rank cell (pure digits) immediately followed by a team-name cell.
    # NCAA.com's markup varies, so the name match is deliberately loose.
    row_re = re.compile(
        r'<td[^>]*>\s*(\d+)\s*</td>\s*<td[^>]*>.*?([A-Z][a-zA-Z .&\']+(?:\s[A-Z][a-zA-Z .&\']+)*)\s*</td>',
        re.DOTALL,
    )

    found = {}
    for rank_text, raw_name in row_re.findall(html):
        rank = int(rank_text)
        team = raw_name.strip()
        # Keep only plausible Top-25 rows; very short "names" are markup noise.
        if 1 <= rank <= 25 and len(team) > 2:
            found[team] = rank

    return found


def parse_d1baseball_rankings(html):
    """
    Parse rankings from D1Baseball rankings page.
    Rankings are in a table or list format.
    """
    # Boilerplate substrings that signal a link/label, not a team name.
    skip_words = ('Click', 'Read', 'More', 'Share', 'View')

    # Tried in order, strictest first; stop as soon as one yields a near-full poll.
    patterns = (
        # Table row: rank cell + team name cell
        r'<td[^>]*>\s*(\d+)\s*</td>\s*<td[^>]*>\s*([A-Z][a-zA-Z .&\']+(?:\s[A-Z][a-zA-Z .&\']+)*)',
        # List item: "1. Team Name"
        r'(?:^|\n)\s*(\d+)\.\s+([A-Z][a-zA-Z .&\']+(?:\s[A-Z][a-zA-Z .&\']+)*)',
        # Numbered list in article text
        r'(\d+)\s*\.\s*([A-Z][a-zA-Z .&\']{3,})',
    )

    found = {}
    for pattern in patterns:
        for rank_text, raw_name in re.findall(pattern, html, re.MULTILINE):
            rank = int(rank_text)
            team = re.sub(r'\s+', ' ', raw_name).strip()
            # Discard implausible ranks, too-short strings, and link boilerplate.
            if not (1 <= rank <= 25) or len(team) <= 3:
                continue
            if any(word in team for word in skip_words):
                continue
            # First pattern to claim a name wins.
            found.setdefault(team, rank)

        if len(found) >= 20:
            break

    return found


def normalize_name(name):
    """Return a canonical comparison key: trimmed, lower-cased, '&' spelled out."""
    trimmed = name.strip()
    return trimmed.lower().replace("&", "and")


def match_to_sec_name(name):
    """Return our StatCrew name if this is an SEC team, else return as-is."""
    key = normalize_name(name)
    hits = (ours for theirs, ours in NAME_MAP.items()
            if normalize_name(theirs) == key)
    # Non-SEC teams fall through unchanged.
    return next(hits, name)


def scrape_rankings():
    """
    Try multiple sources to get the current D1Baseball Top 25.
    Returns dict: { "team_name": rank_int }
    """
    rankings = {}
    source = None

    # Primary source: D1Baseball's own rankings page.
    print("Fetching D1Baseball rankings...")
    try:
        rankings = parse_d1baseball_rankings(fetch_url(RANKINGS_URL))
    except Exception as e:
        print(f"  ✗ D1Baseball failed: {e}")
    else:
        if len(rankings) >= 20:
            source = "d1baseball.com"
            print(f"  ✓ Got {len(rankings)} teams from D1Baseball")
        else:
            print(f"  ⚠ Only {len(rankings)} teams parsed from D1Baseball, trying NCAA.com")

    # Fallback: NCAA.com mirrors the same poll. Only adopt it if it beats
    # whatever partial result we already have.
    if len(rankings) < 20:
        print("Fetching NCAA.com rankings...")
        try:
            fallback = parse_ncaa_rankings(fetch_url(NCAA_URL))
        except Exception as e:
            print(f"  ✗ NCAA.com failed: {e}")
        else:
            if len(fallback) > len(rankings):
                rankings = fallback
                source = "ncaa.com"
                print(f"  ✓ Got {len(rankings)} teams from NCAA.com")
            else:
                print(f"  ⚠ NCAA.com also returned limited results ({len(fallback)} teams)")

    if not rankings:
        raise RuntimeError("Could not fetch rankings from any source.")

    # Map source team names onto our StatCrew names before returning.
    return {match_to_sec_name(name): rank for name, rank in rankings.items()}, source


def get_current_week_monday():
    """Return the date of this week's Monday (today, if today is a Monday)."""
    # The module top imports only the `datetime` class, so bring timedelta
    # into scope locally — replaces the previous fragile
    # `__import__('datetime').timedelta` hack with an ordinary import.
    from datetime import timedelta

    today = datetime.now().date()
    # date.weekday(): Monday == 0, so subtracting it lands exactly on Monday.
    return today - timedelta(days=today.weekday())


def rankings_are_current(out_dir):
    """
    Check if rankings.json was written this week (on or after Monday).
    Returns (is_current: bool, updated_at: str|None)
    """
    # Both scopes are written together, so checking "overall" suffices.
    path = Path(out_dir) / "overall" / "rankings.json"
    if not path.exists():
        return False, None

    try:
        data = json.loads(path.read_text())
        stamp = data.get("updated_at", "")
        if not stamp:
            return False, None

        written = datetime.fromisoformat(stamp).date()
        # Fresh means written on or after this week's Monday (poll day).
        return written >= get_current_week_monday(), stamp
    except Exception:
        # Unreadable or malformed file → treat as stale; caller will re-scrape.
        return False, None


def write_rankings(rankings, source, out_dir):
    """Write rankings.json to overall/ and conference/ scopes."""
    payload = {
        "source":     source or "unknown",
        "updated_at": datetime.now().isoformat(),
        "rankings":   rankings,
    }
    base = Path(out_dir)
    # The frontend reads the same file from both scope folders.
    for scope in ("overall", "conference"):
        target = base / scope / "rankings.json"
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(payload, indent=2))
        print(f"  Wrote: {target} ({len(rankings)} teams)")


def print_rankings(rankings, source):
    """Pretty-print the Top 25 (SEC teams flagged) and the SEC subset."""
    sec_names = set(NAME_MAP.values())
    by_rank = sorted(rankings.items(), key=lambda item: item[1])

    print("\nTop 25 Rankings:")
    for name, rank in by_rank:
        marker = " ← SEC" if name in sec_names else ""
        print(f"  {rank:2}. {name}{marker}")
    print(f"\nSource: {source}")

    sec_only = [(name, rank) for name, rank in by_rank if name in sec_names]
    print(f"\nSEC teams ranked ({len(sec_only)}):")
    for name, rank in sec_only:
        print(f"  #{rank} {name}")


def main():
    """CLI entry point: parse arguments, scrape the poll, write and/or print it."""
    parser = argparse.ArgumentParser(description="Scrape D1Baseball Top 25 rankings")
    parser.add_argument("--out", default=str(OUT_DIR),
                        help=f"Output directory (default: {OUT_DIR})")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print rankings without writing file")
    parser.add_argument("--force", action="store_true",
                        help="Write even if rankings are already current this week")
    parser.add_argument("--retry", action="store_true",
                        help="Keep retrying every 30 min until this week's rankings appear")
    args = parser.parse_args()

    # Skip the scrape entirely when this week's poll is already on disk.
    if not (args.force or args.dry_run):
        is_current, updated_at = rankings_are_current(args.out)
        if is_current:
            print(f"Rankings already current for this week (updated: {updated_at})")
            print("Use --force to overwrite.")
            return

    if args.retry:
        # Retry loop — poll every 30 minutes until a full Top 25 lands.
        import time
        print(f"Retry mode: will check every 30 minutes until this week's rankings appear.")
        attempt = 0
        while True:
            attempt += 1
            print(f"\nAttempt {attempt} — {datetime.now().strftime('%H:%M:%S')}")
            try:
                rankings, source = scrape_rankings()
            except Exception as e:
                print(f"  Error: {e}")
            else:
                # A near-complete poll (>=20 teams) counts as "published".
                if len(rankings) >= 20:
                    write_rankings(rankings, source, args.out)
                    print_rankings(rankings, source)
                    print(f"\n✓ Rankings written after {attempt} attempt(s).")
                    return
                print(f"  Only {len(rankings)} teams found — rankings may not be updated yet.")

            print(f"  Waiting 30 minutes before next attempt...")
            time.sleep(30 * 60)

    # Normal single run
    try:
        rankings, source = scrape_rankings()
    except RuntimeError as e:
        print(f"\nERROR: {e}")
        sys.exit(1)

    if not args.dry_run:
        write_rankings(rankings, source, args.out)
    print_rankings(rankings, source)


if __name__ == "__main__":
    main()
