#!/usr/bin/env python3
"""
SEC Baseball Roster Scraper
Scrapes official roster pages for all 16 SEC baseball teams
and builds a canonical roster database for name normalization.

Usage:
  python3 scrape_rosters.py                    # scrape all teams
  python3 scrape_rosters.py --team ALA         # scrape one team
  python3 scrape_rosters.py --dry-run          # print without saving

Output:
  data/rosters/baseball/2026/rosters.json      # raw scraped data
  data/rosters/baseball/2026/roster.db         # SQLite with name aliases
  Also updates sec_stats_baseball.db with roster table
"""

import argparse
import json
import re
import sqlite3
import time
import urllib.request
import urllib.parse
from pathlib import Path

# ---------------------------------------------------------------------------
# SEC team roster page URLs (Sidearm Sports platform)
# ---------------------------------------------------------------------------

# Official roster page URLs keyed by team code.  Most of these sites run the
# Sidearm Sports platform; a few (e.g. ARK's WordPress URL scheme) need the
# alternate WMT/LSU-style parsers defined below.
SEC_ROSTER_URLS = {
    "ALA":  "https://rolltide.com/sports/baseball/roster",
    "ARK":  "https://arkansasrazorbacks.com/sport/m-basebl/roster/",
    "AUB":  "https://auburntigers.com/sports/baseball/roster/season/2026",
    "FLA":  "https://floridagators.com/sports/baseball/roster",
    "UGA":  "https://georgiadogs.com/sports/baseball/roster",
    "UK":   "https://ukathletics.com/sports/baseball/roster/season/2026",
    "LSU":  "https://lsusports.net/sports/bsb/roster/season/2026",
    "MISS": "https://olemisssports.com/sports/baseball/roster",
    "MSU":  "https://hailstate.com/sports/baseball/roster",
    "MIZ":  "https://mutigers.com/sports/baseball/roster",
    "OKLA": "https://soonersports.com/sports/baseball/roster",
    "SC":   "https://gamecocksonline.com/sports/baseball/roster/season/2026",
    "TENN": "https://utsports.com/sports/baseball/roster",
    "TEX":  "https://texaslonghorns.com/sports/baseball/roster",
    "TAMU": "https://12thman.com/sports/baseball/roster",
    "VANDY":"https://vucommodores.com/sports/baseball/roster",
}

# Display names keyed by the same team codes.  Also used as a reverse map
# (team_name -> code) when seeding rosters from the stats DB.
SEC_TEAM_NAMES = {
    "ALA":  "Alabama",
    "ARK":  "Arkansas",
    "AUB":  "Auburn",
    "FLA":  "Florida",
    "UGA":  "Georgia",
    "UK":   "Kentucky",
    "LSU":  "LSU",
    "MISS": "Ole Miss",
    "MSU":  "Mississippi State",
    "MIZ":  "Missouri",
    "OKLA": "Oklahoma",
    "SC":   "South Carolina",
    "TENN": "Tennessee",
    "TEX":  "Texas",
    "TAMU": "Texas A&M",
    "VANDY":"Vanderbilt",
}

# Season/sport stamped onto every roster row and interpolated into the
# Sidearm JSON endpoint URLs.
SEASON = "2026"
SPORT  = "baseball"

# ---------------------------------------------------------------------------
# Sidearm roster JSON endpoint pattern
# ---------------------------------------------------------------------------

def get_sidearm_roster(team_code: str, base_url: str) -> list[dict]:
    """
    Fetch the roster for one team, trying strategies in priority order:

    1. /roster/print            — clean HTML table, works on most Sidearm sites
    2. /services/roster.ashx    — JSON endpoint (some schools)
    3. Auburn website-api       — OAS/Nuxt JSON endpoint (AUB only)
    4. WordPress/WMT HTML       — table or schema.org markup (SC, Vandy, UK, LSU, ARK)
    5. Generic HTML scrape of the main roster page — last resort

    Returns a list of normalized player dicts (see _make_player); may be
    empty if every strategy fails.  Progress is printed to stdout.
    """
    domain = urllib.parse.urlparse(base_url).netloc
    base_no_slash = base_url.rstrip("/")

    # Browser-like headers: athletic sites commonly reject the default
    # urllib User-Agent.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/json,*/*",
        "Referer": f"https://{domain}/",
    }

    # ── Strategy 1: /roster/print (cleanest — standard HTML table) ───────────
    # If URL already ends in /print, use as-is; otherwise append /print
    if base_no_slash.endswith('/print'):
        print_url = base_no_slash
    else:
        print_url = f"{base_no_slash}/print"
    try:
        req = urllib.request.Request(print_url, headers=headers)
        with urllib.request.urlopen(req, timeout=15) as r:
            html = r.read().decode("utf-8", errors="replace")
        players = parse_print_roster(html, team_code)
        if players:
            print(f"  ✓ /roster/print ({len(players)} players)")
            return players
        else:
            print(f"  ↩ /roster/print returned 0 players, trying next strategy")
    except Exception as e:
        print(f"  ↩ /roster/print failed ({e}), trying next strategy")

    # ── Strategy 2: JSON endpoint ─────────────────────────────────────────────
    json_endpoints = [
        f"https://{domain}/services/roster.ashx?sport={SPORT}&season={SEASON}",
        f"https://{domain}/roster.aspx?sport={SPORT}&season={SEASON}&format=json",
    ]
    for endpoint in json_endpoints:
        try:
            req = urllib.request.Request(endpoint, headers=headers)
            with urllib.request.urlopen(req, timeout=15) as r:
                raw = r.read()
            # Only attempt to parse bodies that plausibly contain JSON;
            # many sites answer these URLs with an HTML error page.
            if raw.strip().startswith(b"[") or raw.strip().startswith(b"{"):
                data = json.loads(raw)
                players = parse_sidearm_json(data, team_code)
                if players:
                    print(f"  ✓ JSON endpoint ({len(players)} players)")
                    return players
        except Exception:
            continue

    # ── Strategy 3: OAS/Nuxt website-api (Auburn) ─────────────────────────────
    if team_code == "AUB":
        players = fetch_auburn_api_roster(team_code, headers)
        if players:
            print(f"  ✓ Auburn website-api ({len(players)} players)")
            return players

    # ── Strategy 4: WordPress/WMT HTML roster ───────────────────────────────
    # Try the configured URL first, then the URL-scheme variants used by the
    # different WordPress/WMT deployments across the conference.
    wp_urls = list(dict.fromkeys([  # deduplicate while preserving order
        base_url,
        f"https://{domain}/sports/bsb/roster/season/{SEASON}",
        f"https://{domain}/sports/baseball/roster/season/{SEASON}",
        f"https://{domain}/sport/m-basebl/roster/",
        f"https://{domain}/sports/baseball/roster",
        f"https://{domain}/sports/bsb/roster/",
    ]))
    for wp_url in wp_urls:
        try:
            req = urllib.request.Request(wp_url, headers=headers)
            with urllib.request.urlopen(req, timeout=15) as r:
                html_wp = r.read().decode("utf-8", errors="replace")
            # Try WMT table first (SC, Vandy, UK), then LSU-style schema.org
            players = parse_wmt_table_roster(html_wp, team_code)
            if players:
                print(f"  ✓ WMT table ({len(players)} players) from {wp_url}")
                return players
            players = parse_lsu_style_roster(html_wp, team_code)
            if players:
                print(f"  ✓ WordPress HTML ({len(players)} players) from {wp_url}")
                return players
            print(f"  ↩ WordPress HTML 0 players from {wp_url}")
        except Exception as e:
            print(f"  ↩ {wp_url} failed: {e}")
            continue

    # ── Strategy 5: Generic HTML scrape ──────────────────────────────────────
    print(f"  ↩ Falling back to HTML scrape for {team_code}")
    return scrape_html_roster(base_url, team_code, headers)


def parse_print_roster(html: str, team_code: str) -> list:
    """
    Parse the /roster/print page — a standard HTML table with columns:
    # | Name | Pos. | B/T | Ht. | Wt. | Yr. | Hometown | ...

    Works for all Sidearm Sports sites (all 16 SEC schools use Sidearm).
    Returns a list of normalized player dicts; empty list when no table
    rows are found.
    """
    rows = re.findall(r'<tr[^>]*>(.*?)</tr>', html, re.DOTALL | re.IGNORECASE)
    if not rows:
        return []

    def _cells(row: str) -> list[str]:
        """Return stripped, tag-free text for each <td>/<th> cell of a row."""
        raw = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL | re.IGNORECASE)
        return [re.sub(r'<[^>]+>', '', c).strip() for c in raw]

    # Locate the header row (contains "Name"/"Player") to map column indices.
    header_idx = -1
    header_cols: list[str] = []
    for i, row in enumerate(rows):
        cells = _cells(row)
        if any(c.lower() in ('name', 'player', 'full name') for c in cells):
            header_cols = [c.lower() for c in cells]
            header_idx = i
            break

    players = []

    if not header_cols:
        # No recognizable header: assume the conventional Sidearm layout
        # col0=#, col1=Name, col2=Pos, col6=Yr, and skip the first row
        # (presumed header).
        for row in rows[1:]:
            cells = _cells(row)
            if len(cells) < 3:
                continue
            name = cells[1] if len(cells) > 1 else ""
            # Require a plausible "First Last" display name.
            if not re.match(r'^[A-Z][a-z]', name) or len(name.strip().split()) < 2:
                continue
            jersey = cells[0].strip() if cells[0].strip().isdigit() else ""
            pos    = cells[2].strip() if len(cells) > 2 else ""
            yr     = cells[6].strip() if len(cells) > 6 else ""
            players.append(_make_player(name, team_code, jersey, pos, yr))
        return players

    # Header found: resolve column indices, falling back to the common
    # Sidearm positions when a header label is missing.
    name_idx   = next((i for i, h in enumerate(header_cols) if 'name' in h or 'player' in h), 1)
    jersey_idx = next((i for i, h in enumerate(header_cols) if h in ('#', 'no', 'no.', 'number')), 0)
    pos_idx    = next((i for i, h in enumerate(header_cols) if 'pos' in h), 2)
    yr_idx     = next((i for i, h in enumerate(header_cols) if h in ('yr', 'yr.', 'year', 'class', 'cl', 'cl.')), -1)

    # Only rows AFTER the header are data rows.  (Previously all rows were
    # scanned and only rows byte-equal to the header were skipped, which let
    # caption/title rows before the header slip through the name filter.)
    for row in rows[header_idx + 1:]:
        cells = _cells(row)
        if len(cells) <= name_idx:
            continue
        name = cells[name_idx].strip()
        # Must look like a real name: "First Last"
        if not re.match(r'^[A-Z][a-z]', name) or len(name.split()) < 2:
            continue
        jersey = cells[jersey_idx].strip() if jersey_idx < len(cells) else ""
        pos    = cells[pos_idx].strip()    if pos_idx    < len(cells) else ""
        yr     = cells[yr_idx].strip()     if yr_idx >= 0 and yr_idx < len(cells) else ""
        players.append(_make_player(name, team_code, jersey, pos, yr))

    return players


def _make_player(name: str, team_code: str, jersey: str = "", pos: str = "", yr: str = "") -> dict:
    """Build the canonical player dict used throughout this module.

    Accepts either "First Last" or "Last, First" display formats; the
    returned display_name is always in "First Last" order.
    """
    cleaned = name.strip()

    if "," in cleaned:
        # "Last, First" — split on the first comma and rebuild the display.
        last_part, _, first_part = cleaned.partition(",")
        first = first_part.strip()
        last = last_part.strip()
        display = f"{first} {last}"
    else:
        tokens = cleaned.split()
        first = tokens[0] if tokens else ""
        last = tokens[-1] if len(tokens) > 1 else ""
        display = cleaned

    record = {
        "team_code":    team_code,
        "team_name":    SEC_TEAM_NAMES[team_code],
        "first_name":   first,
        "last_name":    last,
        "display_name": display,
        "jersey":       jersey,
        "position":     pos,
        "class_":       yr,
    }
    # Fields not derivable from a display name default to empty strings.
    record.update(height="", weight="", hometown="", espn_id="")
    return record


def parse_sidearm_json(data, team_code: str) -> list:
    """Parse Sidearm roster JSON into normalized player dicts.

    Accepts either a bare list of player objects or a dict wrapping the
    list under one of the common keys ("data", "roster", "players").
    """
    if isinstance(data, list):
        items = data
    else:
        items = data.get("data", data.get("roster", data.get("players", [])))

    team_name = SEC_TEAM_NAMES[team_code]
    roster = []

    for entry in items:
        if not isinstance(entry, dict):
            continue

        first = (entry.get("first_name") or entry.get("firstName") or "").strip()
        last  = (entry.get("last_name")  or entry.get("lastName")  or "").strip()
        if not first and not last:
            # Fall back to a combined name field.
            full = entry.get("name") or entry.get("display_name") or ""
            pieces = full.strip().split()
            if len(pieces) >= 2:
                first, last = pieces[0], pieces[-1]
            else:
                last = full.strip()

        if not (first or last):
            continue  # no usable name at all

        roster.append({
            "team_code":    team_code,
            "team_name":    team_name,
            "first_name":   first,
            "last_name":    last,
            "display_name": f"{first} {last}".strip(),
            "jersey":       str(entry.get("jersey") or entry.get("number") or "").strip(),
            "position":     str(entry.get("position") or entry.get("pos") or "").strip(),
            "class_":       str(entry.get("academic_year") or entry.get("class") or entry.get("year") or "").strip(),
            "height":       str(entry.get("height") or "").strip(),
            "weight":       str(entry.get("weight") or "").strip(),
            "hometown":     str(entry.get("hometown") or "").strip(),
            "espn_id":      str(entry.get("espn_id") or entry.get("id") or "").strip(),
        })

    return roster


def parse_lsu_style_roster(html: str, team_code: str) -> list:
    """Parse LSU/WordPress-style roster pages using schema.org markup and DataTable rows.

    Both methods are attempted and whichever yields more players wins
    (ties go to the schema.org result).  Returns normalized player dicts.
    """
    players_schema = []
    players_table  = []

    # Method 1: itemprop="athlete" blocks with itemprop="name" content attribute
    # Splitting on the attribute gives one chunk per athlete (index 0 is the
    # preamble before the first athlete and is skipped).
    athlete_blocks = html.split('itemprop="athlete"')
    for block in athlete_blocks[1:]:
        name_m = re.search(r'itemprop="name"[^>]*content="([^"]+)"', block)
        if not name_m:
            continue
        name = name_m.group(1).strip()
        # Require a plausible "First Last" name before accepting the block.
        if not re.match(r'^[A-Z][a-z]', name) or len(name.split()) < 2:
            continue
        jersey_m = re.search(r'roster-list_item_number[^>]*>[\s\n]*(\d+)', block)
        jersey   = jersey_m.group(1).strip() if jersey_m else ""
        pos_m    = re.search(r'roster-list_item_info_position[^>]*>[\s\n]*([^\n<]{2,50})', block)
        pos      = pos_m.group(1).strip() if pos_m else ""
        # Third <li> of the info list — assumed to be the class year on
        # LSU-style markup; TODO confirm against more of these pages.
        yr_m     = re.search(r'<li>[^<]*</li>\s*<li>[^<]*</li>\s*<li>([^<]{2,40})</li>', block)
        yr       = yr_m.group(1).strip() if yr_m else ""
        players_schema.append(_make_player(name, team_code, jersey, pos, yr))

    # Method 2: DataTable <tr class="odd|even"> rows
    # Assumed column layout: 0=#, 1=Name, 2=Pos, 5=Yr.
    rows = re.findall(r'<tr[^>]*class="(?:odd|even)"[^>]*>(.*?)</tr>', html, re.DOTALL)
    for row in rows:
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row, re.DOTALL)
        cells = [re.sub(r'<[^>]+>', '', c).strip() for c in cells]
        if len(cells) < 3:
            continue
        jersey = cells[0] if cells[0].isdigit() else ""
        name   = cells[1] if len(cells) > 1 else ""
        pos    = cells[2] if len(cells) > 2 else ""
        yr     = cells[5] if len(cells) > 5 else ""
        if not re.match(r'^[A-Z][a-z]', name) or len(name.split()) < 2:
            continue
        players_table.append(_make_player(name, team_code, jersey, pos, yr))

    # Prefer whichever extraction found more players.
    return players_schema if len(players_schema) >= len(players_table) else players_table


def parse_wmt_table_roster(html: str, team_code: str) -> list[dict]:
    """
    Parse WordPress/WMT-themed roster pages that use an HTML <table>.
    Works for South Carolina, Vanderbilt, Kentucky, and similar WMT sites.
    Table has columns like: Number, Name, Position, Height, Weight, Class, ...

    Column positions are resolved from the <th> header row, so tables with
    reordered or missing columns still parse.  Returns normalized player
    dicts; empty list if no roster-looking table or header is found.
    """
    # Find any table that looks like a roster table.
    # Attempt 1: table whose id contains "players" (common WMT markup).
    table_match = re.search(
        r'<table[^>]*id="[^"]*players[^"]*"[^>]*>(.*?)</table>',
        html, re.DOTALL | re.IGNORECASE
    )
    if not table_match:
        # Arkansas/older WMT: table with rostname/rostnum header classes
        # (lookahead only requires "rostname" to appear somewhere inside).
        table_match = re.search(
            r'<table[^>]*class="[^"]*table[^"]*"[^>]*>(?=.*?rostname)(.*?)</table>',
            html, re.DOTALL | re.IGNORECASE
        )
    if not table_match:
        # Attempt 3: any table whose class mentions "roster".
        table_match = re.search(
            r'<table[^>]*class="[^"]*roster[^"]*"[^>]*>(.*?)</table>',
            html, re.DOTALL | re.IGNORECASE
        )
    if not table_match:
        return []

    table_html = table_match.group(1)
    rows = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.DOTALL)
    if not rows:
        return []

    # Parse header to find column indices
    header_cols = []
    header_idx = -1
    for i, row in enumerate(rows):
        ths = re.findall(r'<th[^>]*>(.*?)</th>', row, re.DOTALL | re.IGNORECASE)
        if ths:
            header_cols = [re.sub(r'<[^>]+>', '', h).strip().lower() for h in ths]
            header_idx = i
            break

    if not header_cols:
        return []

    # Map column names to indices (-1 = column absent).
    num_idx  = next((i for i, h in enumerate(header_cols) if h in ('number', 'num', '#', 'no', 'no.')), -1)
    name_idx = next((i for i, h in enumerate(header_cols) if 'name' in h), -1)
    pos_idx  = next((i for i, h in enumerate(header_cols) if 'pos' in h), -1)
    ht_idx   = next((i for i, h in enumerate(header_cols) if h in ('height', 'ht', 'ht.')), -1)
    wt_idx   = next((i for i, h in enumerate(header_cols) if h in ('weight', 'wt', 'wt.')), -1)
    yr_idx   = next((i for i, h in enumerate(header_cols) if h in ('class', 'cl', 'cl.', 'yr', 'yr.', 'year')), -1)
    # NOTE: ht_idx2 is the HOMETOWN column, despite the "ht" prefix.
    ht_idx2  = next((i for i, h in enumerate(header_cols) if 'hometown' in h), -1)

    players = []
    for row in rows[header_idx + 1:]:
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row, re.DOTALL | re.IGNORECASE)
        cells_text = [re.sub(r'<[^>]+>', '', c).strip() for c in cells]
        if len(cells_text) < 3:
            continue

        name = cells_text[name_idx] if name_idx >= 0 and name_idx < len(cells_text) else ""
        name = re.sub(r'\s+', ' ', name).strip()
        # Require a plausible "First Last" display name.
        if not name or not re.match(r'^[A-Z][a-z]', name) or len(name.split()) < 2:
            continue

        jersey = cells_text[num_idx].strip() if num_idx >= 0 and num_idx < len(cells_text) else ""
        pos    = cells_text[pos_idx].strip() if pos_idx >= 0 and pos_idx < len(cells_text) else ""
        ht     = cells_text[ht_idx].strip() if ht_idx >= 0 and ht_idx < len(cells_text) else ""
        # Strip a trailing "lbs"/"lbs." unit from the weight cell.
        wt     = re.sub(r'\s*lbs\.?', '', cells_text[wt_idx]).strip() if wt_idx >= 0 and wt_idx < len(cells_text) else ""
        yr     = cells_text[yr_idx].strip() if yr_idx >= 0 and yr_idx < len(cells_text) else ""
        hometown = cells_text[ht_idx2].strip() if ht_idx2 >= 0 and ht_idx2 < len(cells_text) else ""

        p = _make_player(name, team_code, jersey, pos, yr)
        p["height"] = ht
        p["weight"] = wt
        p["hometown"] = hometown
        players.append(p)

    return players


def fetch_auburn_api_roster(team_code: str, headers: dict) -> list[dict]:
    """
    Fetch Auburn roster from the OAS/Nuxt website-api endpoint.
    Returns normalized player dicts (empty list on any fetch/parse failure).
    """
    # NOTE(review): roster_id=1026 is hard-coded — presumably the 2026
    # baseball roster; this will need updating each season.  TODO confirm.
    api_url = (
        "https://auburntigers.com/website-api/player-rosters"
        "?filter[roster_id]=1026"
        "&include=player,classLevel,playerPosition"
        "&per_page=200&page=1"
    )
    try:
        req = urllib.request.Request(api_url, headers=headers)
        with urllib.request.urlopen(req, timeout=15) as r:
            data = json.loads(r.read().decode("utf-8", errors="replace"))
    except Exception as e:
        print(f"  ↩ Auburn API failed: {e}")
        return []

    players = []
    for entry in data.get("data", []):
        player_info = entry.get("player", {})
        first = (player_info.get("first_name") or "").strip()
        last  = (player_info.get("last_name") or "").strip()
        if not first and not last:
            continue

        # NOTE(review): the include params are camelCase but these keys are
        # read as snake_case — assumes the API serializes includes that way;
        # verify against a live response.
        pos_obj = entry.get("player_position") or {}
        cls_obj = entry.get("class_level") or {}
        # Height may live on the roster entry or the nested player object.
        ht_ft   = entry.get("height_feet") or player_info.get("height_feet") or ""
        ht_in   = entry.get("height_inches") or player_info.get("height_inches") or ""
        height  = f"{ht_ft}-{ht_in}" if ht_ft else ""
        weight  = str(entry.get("weight") or player_info.get("weight") or "")

        players.append({
            "team_code":    team_code,
            "team_name":    SEC_TEAM_NAMES[team_code],
            "first_name":   first,
            "last_name":    last,
            "display_name": f"{first} {last}".strip(),
            "jersey":       str(entry.get("jersey_number") or ""),
            "position":     pos_obj.get("abbreviation") or pos_obj.get("name") or "",
            "class_":       cls_obj.get("name") or "",
            "height":       height,
            "weight":       weight,
            "hometown":     player_info.get("hometown") or "",
            "espn_id":      "",
        })

    return players


def scrape_html_roster(url: str, team_code: str, headers: dict) -> list[dict]:
    """
    Fallback HTML scraper for Sidearm roster pages.

    Tries, in order:
      1. an embedded ``var rosterData = [...]`` JS array,
      2. Sidearm "s-person-details" cards with data-first/last-name attrs,
      3. generic "First Last" extraction from name-ish class elements.

    Returns normalized player dicts; empty list when the page cannot be
    fetched or nothing matches.
    """
    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=15) as r:
            html = r.read().decode("utf-8", errors="replace")
    except Exception as e:
        print(f"  ✗ Could not fetch {url}: {e}")
        return []

    players = []

    # Pattern 1: JSON-LD or embedded JS data
    json_match = re.search(r'var\s+rosterData\s*=\s*(\[.*?\]);', html, re.DOTALL)
    if json_match:
        try:
            data = json.loads(json_match.group(1))
            return parse_sidearm_json(data, team_code)
        except Exception:
            # Best-effort: malformed embedded JSON falls through to the HTML
            # patterns below.  (Was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.)
            pass

    # Pattern 2: Sidearm roster card HTML pattern
    # <li class="s-person-details">
    #   <h2 class="s-person-details__personal-single-line">Jake Brown</h2>
    # Both attribute orders (first-then-last, last-then-first) are handled.
    pattern = re.compile(
        r'<[^>]+class="[^"]*s-person-details[^"]*"[^>]*>.*?'
        r'(?:data-first-name="([^"]*)"[^>]*data-last-name="([^"]*)"'
        r'|data-last-name="([^"]*)"[^>]*data-first-name="([^"]*)")',
        re.DOTALL | re.IGNORECASE
    )
    for m in pattern.finditer(html):
        first = (m.group(1) or m.group(4) or "").strip()
        last  = (m.group(2) or m.group(3) or "").strip()
        if first or last:
            players.append({
                "team_code":    team_code,
                "team_name":    SEC_TEAM_NAMES[team_code],
                "first_name":   first,
                "last_name":    last,
                "display_name": f"{first} {last}".strip(),
                "jersey": "", "position": "", "class_": "",
                "height": "", "weight": "", "hometown": "", "espn_id": "",
            })

    if not players:
        # Pattern 3: generic name extraction from roster cards
        name_pat = re.compile(
            r'<(?:h[123]|span)[^>]+class="[^"]*(?:name|player-name|roster-name)[^"]*"[^>]*>'
            r'\s*([A-Z][a-z]+ [A-Z][a-zA-Z\'\-]+(?:\s+(?:Jr\.|Sr\.|II|III|IV))?)\s*</',
            re.IGNORECASE
        )
        seen = set()  # de-duplicate repeated card markup
        for m in name_pat.finditer(html):
            name = m.group(1).strip()
            if name in seen:
                continue
            seen.add(name)
            parts = name.split()
            first = parts[0] if parts else ""
            last  = parts[-1] if len(parts) > 1 else ""
            if first and last:
                players.append({
                    "team_code":    team_code,
                    "team_name":    SEC_TEAM_NAMES[team_code],
                    "first_name":   first,
                    "last_name":    last,
                    "display_name": name,
                    "jersey": "", "position": "", "class_": "",
                    "height": "", "weight": "", "hometown": "", "espn_id": "",
                })

    print(f"  {'✓' if players else '✗'} HTML scraped {len(players)} players for {team_code}")
    return players


# ---------------------------------------------------------------------------
# Database setup
# ---------------------------------------------------------------------------

def setup_roster_table(conn: sqlite3.Connection):
    """Create the roster schema (two tables + alias index) if it does not exist."""
    ddl = (
        # Canonical player rows, unique per (season, sport, team, display name).
        """
        CREATE TABLE IF NOT EXISTS roster (
            id              INTEGER PRIMARY KEY AUTOINCREMENT,
            season          TEXT NOT NULL,
            sport           TEXT NOT NULL,
            team_code       TEXT NOT NULL,
            team_name       TEXT NOT NULL,
            first_name      TEXT NOT NULL,
            last_name       TEXT NOT NULL,
            display_name    TEXT NOT NULL,
            jersey          TEXT,
            position        TEXT,
            class_          TEXT,
            height          TEXT,
            weight          TEXT,
            hometown        TEXT,
            espn_id         TEXT,
            scraped_at      TEXT DEFAULT (datetime('now')),
            UNIQUE(season, sport, team_code, display_name)
        )
    """,
        # Alternate spellings mapping back to a canonical roster row.
        """
        CREATE TABLE IF NOT EXISTS roster_aliases (
            id              INTEGER PRIMARY KEY AUTOINCREMENT,
            roster_id       INTEGER REFERENCES roster(id),
            alias           TEXT NOT NULL,
            source          TEXT,  -- 'xml', 'manual', 'scraped'
            UNIQUE(roster_id, alias)
        )
    """,
        # Fast reverse lookup: alias text -> roster row.
        """
        CREATE INDEX IF NOT EXISTS idx_aliases_alias
        ON roster_aliases(alias)
    """,
    )
    for statement in ddl:
        conn.execute(statement)
    conn.commit()


def upsert_players(conn: sqlite3.Connection, players: list[dict]):
    """Insert or update players, auto-generating common aliases.

    Upserts on (season, sport, team_code, display_name).  Avoids the SQL
    ``RETURNING`` clause (SQLite >= 3.35 only): on older bundled SQLite
    builds it raises OperationalError, so the previous fallback SELECT
    never ran.  The row id is now always re-selected after the upsert.
    """
    for p in players:
        # Upsert the canonical record.  espn_id is only overwritten when the
        # incoming value is non-empty, so a scrape without ids keeps old ones.
        conn.execute("""
            INSERT INTO roster
                (season, sport, team_code, team_name,
                 first_name, last_name, display_name,
                 jersey, position, class_, height, weight, hometown, espn_id)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(season, sport, team_code, display_name)
            DO UPDATE SET
                jersey   = excluded.jersey,
                position = excluded.position,
                class_   = excluded.class_,
                espn_id  = CASE WHEN excluded.espn_id != '' THEN excluded.espn_id ELSE roster.espn_id END,
                scraped_at = datetime('now')
        """, (
            SEASON, SPORT, p["team_code"], p["team_name"],
            p["first_name"], p["last_name"], p["display_name"],
            p["jersey"], p["position"], p["class_"],
            p["height"], p["weight"], p["hometown"], p["espn_id"],
        ))
        row = conn.execute(
            "SELECT id FROM roster WHERE season=? AND sport=? AND team_code=? AND display_name=?",
            (SEASON, SPORT, p["team_code"], p["display_name"])
        ).fetchone()
        if not row:
            continue
        roster_id = row[0]

        # Auto-generate common aliases for name normalization lookups.
        first, last = p["first_name"], p["last_name"]
        display     = p["display_name"]

        aliases = set()
        aliases.add(display)                     # "Jake Brown"
        aliases.add(display.upper())             # "JAKE BROWN"
        if last:
            aliases.add(last)                    # "Brown"
            aliases.add(last.upper())            # "BROWN"
        if first and last:
            aliases.add(f"{last}, {first}")      # "Brown, Jake"
            aliases.add(f"{last},{first}")        # "Brown,Jake"
            aliases.add(f"{last.upper()}, {first.upper()}")
            # Initial + Last: "J. Brown"
            aliases.add(f"{first[0]}. {last}")
            aliases.add(f"{first[0]}.{last}")
            # Short forms: first 4 chars of last
            if len(last) > 4:
                aliases.add(f"{first} {last[:4]}")  # "Jake Brow"

        for alias in aliases:
            if alias.strip():
                conn.execute("""
                    INSERT OR IGNORE INTO roster_aliases (roster_id, alias, source)
                    VALUES (?, ?, 'scraped')
                """, (roster_id, alias.strip()))

    conn.commit()


def lookup_player(conn: sqlite3.Connection, raw_name: str, team_code: str):
    """
    Look up a player by any known alias within a team.
    Returns the canonical roster record as a dict, or None if not found.

    NOTE: dict(row) requires conn.row_factory = sqlite3.Row.
    """
    sql = """
        SELECT r.* FROM roster r
        JOIN roster_aliases a ON a.roster_id = r.id
        WHERE a.alias = ? AND r.team_code = ? AND r.season = ? AND r.sport = ?
        LIMIT 1
    """
    params = (raw_name.strip(), team_code, SEASON, SPORT)
    match = conn.execute(sql, params).fetchone()
    if match is None:
        return None
    return dict(match)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def seed_from_db(conn: sqlite3.Connection, team_codes: list = None):
    """
    Seed roster table from existing player_hitting/player_pitching data.
    Useful for teams where web scraping fails but we have StatCrew XML data.
    Maps team_name from DB to team_code using SEC_TEAM_NAMES reverse lookup.

    team_codes: optional list of codes to restrict seeding to.
    Requires conn.row_factory = sqlite3.Row (rows are accessed by name).
    """
    name_to_code = {v: k for k, v in SEC_TEAM_NAMES.items()}

    # Get all distinct players from stats tables
    rows = conn.execute("""
        SELECT DISTINCT player_name, short_name, team_name, uni, 'batting' as src
        FROM player_hitting
        UNION
        SELECT DISTINCT player_name, short_name, team_name, uni, 'pitching' as src
        FROM player_pitching
        UNION
        SELECT DISTINCT player_name, short_name, team_name, uni, 'fielding' as src
        FROM player_fielding
    """).fetchall()

    seen_keys = set()  # (team_code, display_name) pairs already queued
    players = []
    for r in rows:
        tname = r['team_name']
        code  = name_to_code.get(tname)
        if not code:
            continue
        if team_codes and code not in team_codes:
            continue
        # Skip if we already have good data for this team from web scrape
        existing = conn.execute(
            "SELECT COUNT(*) FROM roster WHERE team_code=? AND season=? AND sport=?",
            (code, SEASON, SPORT)
        ).fetchone()[0]
        if existing > 5:
            continue  # already well-seeded from web scrape

        name = (r['player_name'] or '').strip()
        if not name:
            continue  # empty name previously crashed with IndexError below

        # Parse into (first, last).  Check the comma form FIRST so that
        # "Brown,Jake" (comma, no space -> a single whitespace token) is
        # still split correctly instead of landing in the last-name branch.
        if ',' in name:
            last, _, first = name.partition(',')
            first, last = first.strip(), last.strip()
        else:
            parts = name.split()
            if len(parts) == 1:
                # Last-name-only entry — can't do much without a first name,
                # but at least get it into the alias table.
                first, last = '', parts[0]
            else:
                first, last = parts[0], parts[-1]

        display = f"{first} {last}".strip() if first else last

        key = (code, display)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        players.append({
            "team_code":    code,
            "team_name":    tname,
            "first_name":   first,
            "last_name":    last,
            "display_name": display,
            "jersey":       str(r['uni'] or '').strip(),
            "position":     '',
            "class_":       '',
            "height": "", "weight": "", "hometown": "", "espn_id": "",
        })

    if players:
        setup_roster_table(conn)
        upsert_players(conn, players)
        # Per-team summary of what was seeded.
        by_team = {}
        for p in players:
            by_team[p['team_code']] = by_team.get(p['team_code'], 0) + 1
        for code, count in sorted(by_team.items()):
            print(f"  Seeded {count} players for {SEC_TEAM_NAMES[code]} ({code}) from DB stats")
    else:
        print("  No new players to seed from DB")
    conn.commit()


def main():
    """CLI entry point — parse args, scrape or seed rosters, save JSON + SQLite.

    Fixes over the previous version:
      * invalid --team now produces an argparse error instead of a KeyError;
      * --dry-run no longer creates the DB file/tables or the output dir.
    """
    ap = argparse.ArgumentParser(description="Scrape SEC baseball rosters")
    ap.add_argument("--team",    help="Scrape single team (e.g. ALA)")
    ap.add_argument("--db",      default="sec_stats_baseball.db", help="SQLite DB path")
    ap.add_argument("--out",     default="data/rosters/baseball/2026", help="Output directory")
    ap.add_argument("--dry-run", action="store_true", help="Print without saving")
    ap.add_argument("--delay",   type=float, default=1.5, help="Delay between requests (seconds)")
    ap.add_argument("--seed-missing", action="store_true", help="Seed from DB stats for teams with no roster data")
    args = ap.parse_args()

    # Fail fast on a bad team code instead of a raw KeyError later.
    if args.team and args.team not in SEC_ROSTER_URLS:
        ap.error(f"unknown team code {args.team!r} (valid: {', '.join(sorted(SEC_ROSTER_URLS))})")

    db_path = Path(args.db)

    # --seed-missing: populate roster table from existing stats tables, no web scraping.
    if args.seed_missing:
        print("\nSeeding from DB stats for teams with no scraped roster data...")
        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row
        setup_roster_table(conn)
        seed_from_db(conn, [args.team] if args.team else None)
        conn.close()
        return

    teams_to_scrape = {args.team: SEC_ROSTER_URLS[args.team]} if args.team else SEC_ROSTER_URLS
    all_players = []

    for code, url in teams_to_scrape.items():
        print(f"\n{'='*50}")
        print(f"Scraping {SEC_TEAM_NAMES[code]} ({code})")
        print(f"  URL: {url}")
        players = get_sidearm_roster(code, url)
        print(f"  Found: {len(players)} players")
        if players:
            for p in players[:3]:  # preview a few rows for sanity-checking
                print(f"    {p['jersey']:>3}  {p['display_name']:<25}  {p['position']:<8}  {p['class_']}")
        all_players.extend(players)
        if len(teams_to_scrape) > 1:
            time.sleep(args.delay)  # be polite between sites

    print(f"\n{'='*50}")
    print(f"Total players scraped: {len(all_players)}")

    if args.dry_run:
        print("\nDry run — not saving.")
        return

    # Save raw JSON
    out_dir = Path(args.out)
    out_dir.mkdir(parents=True, exist_ok=True)
    json_path = out_dir / "rosters.json"
    with open(json_path, "w") as f:
        json.dump(all_players, f, indent=2)
    print(f"Saved raw JSON: {json_path}")

    # Save to DB
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    setup_roster_table(conn)
    upsert_players(conn, all_players)

    # Report
    total = conn.execute(
        "SELECT COUNT(*) FROM roster WHERE season=? AND sport=?", (SEASON, SPORT)
    ).fetchone()[0]
    alias_count = conn.execute(
        "SELECT COUNT(*) FROM roster_aliases"
    ).fetchone()[0]
    print(f"DB: {total} players, {alias_count} aliases → {db_path}")
    conn.close()


if __name__ == "__main__":
    main()
