"""
1. StatsBomb — single source of truth for matches, lineups, events, coaches, stadiums, and players.

 - Players carry a randomly generated date_of_birth
 - Stadiums get a randomly generated capacity and year_built
 - Coach appearances are *collected* during the import
   (one tuple per match per coach) and then collapsed
   into proper stints with realistic start/end dates
   and a generated salary in a single finalisation
   step.

2. Football-data — bookmaker odds and referee assignments for the StatsBomb matches that are already in the DB.

 - A 'Generated' time-series of odds is produced
   for every match at minutes 0, 10, 20, ..., 90 using
   the StatsBomb final score.

3. Transfermarkt — player valuations and transfers (unchanged).

"""

import csv
import json
import math
import os
import random
import re
from datetime import date, datetime, time, timedelta

import pandas as pd
import psycopg2
from psycopg2.extras import execute_values
from rapidfuzz import process as fuzz_process, fuzz

DB_CONFIG = {
    "host":     "<host>",
    "port":     "<port>",
    "dbname":   "<dbname>",
    "user":     "<user>",
    "password": "<password>",
}

# football-data.co.uk CSV folder (one CSV per division/season)
FD_FOLDER = "<fd_root>"

# Transfermarkt Kaggle dataset folder
TM_ROOT = "<transfermarkt_root>"
TM_PLAYERS = f"{TM_ROOT}/players.csv"
TM_VALUATIONS = f"{TM_ROOT}/player_valuations.csv"
TM_TRANSFERS = f"{TM_ROOT}/transfers.csv"

# StatsBomb open-data repo folder
SB_ROOT = "<statsbomb_root>"
SB_COMPETITIONS = os.path.join(SB_ROOT, "data", "competitions.json")
SB_MATCHES_DIR = os.path.join(SB_ROOT, "data", "matches")
SB_LINEUPS_DIR = os.path.join(SB_ROOT, "data", "lineups")
SB_EVENTS_DIR = os.path.join(SB_ROOT, "data", "events")

FD_TEAM_ALIASES: dict[str, str] = {
    "Hamburg": "Hamburger SV",
    "M'gladbach": "Borussia Mönchengladbach",
    "West Ham": "West Ham United",
    "Wolves": "Wolverhampton Wanderers",
    "Man City": "Manchester City",
    "West Brom": "West Bromwich Albion",
    "Man United": "Manchester United",
    "Paris SG": "Paris Saint-Germain",
}

TEAM_FUZZY_THRESHOLD = 80  # team name matching
PLAYER_FUZZY_THRESHOLD = 90  # stricter — player names collide more easily

# StatsBomb matches to process between DB commits.
SB_COMMIT_EVERY = 500

PLAYER_DOB_START = date(1975, 1, 1)
PLAYER_DOB_END = date(2008, 12, 31)

STADIUM_CAPACITY_MIN = 8_000
STADIUM_CAPACITY_MAX = 90_000

STADIUM_YEAR_MIN = 1900
STADIUM_YEAR_MAX = 2015

COACH_SALARY_MIN = 500_000
COACH_SALARY_MAX = 10_000_000

COACH_STILL_ACTIVE_DAYS = 180  # 6 months

COACH_STINT_GAP_DAYS = 90  # a gap longer than this between matches starts a new stint

COACH_STINT_END_BUFFER_DAYS = 30  # padding added after a closed stint's last match

REFEREE_DOB_START = date(1955, 1, 1)
REFEREE_DOB_END = date(1985, 12, 31)


# pick a random player date of birth within configured bounds
def _random_player_dob() -> date:
    span = (PLAYER_DOB_END - PLAYER_DOB_START).days
    return PLAYER_DOB_START + timedelta(days=random.randint(0, span))


# pick a stadium capacity in the configured range
def _random_stadium_capacity() -> int:
    return random.randint(STADIUM_CAPACITY_MIN, STADIUM_CAPACITY_MAX)


# pick a plausible stadium build year
def _random_stadium_year() -> int:
    return random.randint(STADIUM_YEAR_MIN, STADIUM_YEAR_MAX)


# pick a random coach salary and round to nearest 1000
def _random_coach_salary() -> int:
    return round(random.randint(COACH_SALARY_MIN, COACH_SALARY_MAX), -3)


# pick a random referee date of birth within configured bounds
def _random_referee_dob() -> date:
    span = (REFEREE_DOB_END - REFEREE_DOB_START).days
    return REFEREE_DOB_START + timedelta(days=random.randint(0, span))


def load_json(path: str):
    with open(path, encoding="utf-8") as f:
        return json.load(f)


# resolves team name to db team_id
# order: cache -> exact match -> alias -> single-word match -> fuzzy -> new team
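# e.g. resolve("Paris SG") can hit the manual alias map, while
# resolve("FC Barcelona") falls through to the word match on "Barcelona".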
class TeamResolver:
    def __init__(self, cur, manual_aliases: dict[str, str] | None = None):
        self._by_name: dict[str, int] = {}
        self._by_alias: dict[str, int] = {}
        self._fuzzy_list: list[tuple[str, int]] = []
        self._resolve_cache: dict[str, int | None] = {}
        self._pending_aliases: list[tuple[int, str]] = []

        cur.execute("SELECT team_id, name FROM Team")
        for tid, name in cur.fetchall():
            self._by_name[name.lower()] = tid
            self._fuzzy_list.append((name, tid))

        cur.execute("SELECT team_id, alias_name FROM Team_alias")
        for tid, alias in cur.fetchall():
            self._by_alias[alias.lower()] = tid

        if manual_aliases:
            for src, dst in manual_aliases.items():
                tid = self._by_name.get(dst.lower())
                if tid:
                    self._by_alias[src.lower()] = tid
                else:
                    print(f"  [alias warning] '{dst}' not found in Team table")

    def resolve(self, name: str, country: str | None = None,
                cur=None, create_if_missing: bool = False) -> int | None:
        if not name:
            return None
        if name in self._resolve_cache:
            return self._resolve_cache[name]

        key = name.lower()

        # 1. exact name
        if key in self._by_name:
            tid = self._by_name[key]
            self._resolve_cache[name] = tid
            return tid

        # 2. alias
        if key in self._by_alias:
            tid = self._by_alias[key]
            self._resolve_cache[name] = tid
            return tid

        # 3. word match (e.g. "FC Barcelona" → "Barcelona")
        for word in name.split():
            if len(word) <= 2:
                continue
            w = word.lower()
            tid = self._by_name.get(w) or self._by_alias.get(w)
            if tid:
                print(f"    [word match] '{name}' → '{word}' (team_id={tid})")
                self._pending_aliases.append((tid, name))
                self._by_alias[key] = tid
                self._resolve_cache[name] = tid
                return tid

        # 4. fuzzy match
        is_womens = "women" in key
        candidates = [
            (n, t) for n, t in self._fuzzy_list
            if ("women" in n.lower()) == is_womens
        ]
        if candidates:
            result = fuzz_process.extractOne(
                name,
                [n for n, _ in candidates],
                scorer=fuzz.token_sort_ratio
            )
            if result and result[1] >= TEAM_FUZZY_THRESHOLD:
                best_name, best_score = result[0], result[1]
                tid = next(t for n, t in candidates if n == best_name)
                print(f"    [fuzzy] '{name}' → '{best_name}' (score {best_score})")
                self._pending_aliases.append((tid, name))
                self._by_alias[key] = tid
                self._resolve_cache[name] = tid
                return tid

        # 5. create new
        if create_if_missing and cur is not None:
            print(f"    [new team] inserting '{name}' (country={country})")
            cur.execute(
                "INSERT INTO Team (name, country) VALUES (%s, %s) RETURNING team_id",
                (name[:100], country)
            )
            tid = cur.fetchone()[0]
            self._by_name[key] = tid
            self._fuzzy_list.append((name, tid))
            self._resolve_cache[name] = tid
            return tid

        print(f"    [no match] '{name}'")
        self._resolve_cache[name] = None
        return None

    def flush_aliases(self, cur):
        if not self._pending_aliases:
            return
        execute_values(
            cur,
            "INSERT INTO Team_alias (team_id, alias_name) VALUES %s ON CONFLICT DO NOTHING",
            self._pending_aliases,
            page_size=2000,
        )
        self._pending_aliases.clear()


# statsbomb

class _SBCache:
    def __init__(self, cur, team_resolver: TeamResolver):
        self.teams = team_resolver  # shared TeamResolver instance
        self.player: dict[str, int] = {}
        self.event_type: dict[str, int] = {}
        self.referee: dict[str, int] = {}
        self.coach: dict[str, int] = {}
        self.division: dict[tuple, int] = {}
        self.season: dict[tuple, int] = {}
        self.stadium: dict[tuple, int] = {}
        self.referee_match_pairs: set[tuple[int, int]] = set()

        # per-match coach appearances, finalised into stints at the end.
        # each entry is (coach_name, nationality, team_id, season_id, match_date).
        self.coach_appearances: list[tuple[str, str | None, int, int, date]] = []

        # final scores keyed by DB match_id, used by the football-data
        # pipeline to drive the generated odds time-series.
        self.match_scores: dict[int, tuple[int, int]] = {}

        cur.execute("SELECT player_id, name FROM Player")
        for pid, name in cur.fetchall():
            self.player[name] = pid

        cur.execute("SELECT event_type_id, type FROM Event_type")
        for etid, t in cur.fetchall():
            self.event_type[t] = etid

        cur.execute("SELECT referee_id, name FROM Referee")
        for rid, name in cur.fetchall():
            self.referee[name] = rid

        cur.execute("SELECT coach_id, name FROM Coach")
        for cid, name in cur.fetchall():
            self.coach[name] = cid

        cur.execute("SELECT referee_id, match_id FROM Referee_match")
        for rid, mid in cur.fetchall():
            self.referee_match_pairs.add((rid, mid))

        cur.execute("SELECT division_id, name, country FROM Division")
        for did, name, country in cur.fetchall():
            self.division[(name, country)] = did

        cur.execute("SELECT season_id, division_id, season_name FROM Season")
        for sid, did, sname in cur.fetchall():
            self.season[(did, sname)] = sid

        cur.execute("SELECT stadium_id, name, country FROM Stadium")
        for stid, name, country in cur.fetchall():
            self.stadium[(name, country)] = stid


    def ensure_division(self, cur, name: str, country: str | None) -> int:
        key = (name, country)
        if key in self.division:
            return self.division[key]
        cur.execute(
            "INSERT INTO Division (name, country) VALUES (%s,%s) RETURNING division_id",
            (name, country)
        )
        did = cur.fetchone()[0]
        self.division[key] = did
        return did

    def ensure_season(self, cur, division_id: int, season_name: str) -> int:
        key = (division_id, season_name)
        if key in self.season:
            return self.season[key]
        cur.execute(
            "SELECT season_id FROM Season WHERE division_id=%s AND season_name=%s LIMIT 1",
            (division_id, season_name)
        )
        row = cur.fetchone()
        if row is None:
            cur.execute(
                "INSERT INTO Season (division_id, season_name) VALUES (%s,%s) RETURNING season_id",
                (division_id, season_name)
            )
            row = cur.fetchone()
        self.season[key] = row[0]
        return row[0]

    def ensure_stadium(self, cur, name: str, country: str | None) -> int:
        # get existing stadium or insert one with generated capacity/year when missing
        key = (name, country)
        if key in self.stadium:
            return self.stadium[key]
        cur.execute(
            "SELECT stadium_id FROM Stadium WHERE name=%s "
            "AND (country=%s OR (country IS NULL AND %s IS NULL)) LIMIT 1",
            (name, country, country)
        )
        row = cur.fetchone()
        if row:
            sid = row[0]
        else:
            # random reasonable values for the columns the source data lacks.
            cur.execute(
                "INSERT INTO Stadium (name, country, capacity, year_built) "
                "VALUES (%s,%s,%s,%s) RETURNING stadium_id",
                (name[:100], country, _random_stadium_capacity(), _random_stadium_year())
            )
            sid = cur.fetchone()[0]
        self.stadium[key] = sid
        return sid


    def ensure_players(self, cur, names: list[str]):
        # batch-insert unseen players and cache their ids
        new = [(n, _random_player_dob())
               for n in {n for n in names if n and n not in self.player}]
        if not new:
            return
        rows = execute_values(
            cur,
            "INSERT INTO Player (name, date_of_birth) VALUES %s RETURNING player_id, name",
            new, fetch=True, page_size=500,
        )
        for pid, name in rows:
            self.player[name] = pid

    def ensure_event_types(self, cur, type_names: list[str]):
        new = list({t for t in type_names if t and t not in self.event_type})
        if not new:
            return
        rows = execute_values(
            cur,
            "INSERT INTO Event_type (type) VALUES %s "
            "ON CONFLICT (type) DO UPDATE SET type=EXCLUDED.type "
            "RETURNING event_type_id, type",
            [(t,) for t in new], fetch=True, page_size=2000,
        )
        for etid, t in rows:
            self.event_type[t] = etid

    def ensure_referee(self, cur, name: str, country: str | None = None) -> int:
        # get existing referee or insert a new one with generated dob
        if name in self.referee:
            return self.referee[name]
        cur.execute("SELECT referee_id FROM Referee WHERE name=%s LIMIT 1", (name,))
        row = cur.fetchone()
        if not row:
            cur.execute(
                "INSERT INTO Referee (name, country, date_of_birth) "
                "VALUES (%s,%s,%s) RETURNING referee_id",
                (name, country, _random_referee_dob())
            )
            row = cur.fetchone()
        self.referee[name] = row[0]
        return row[0]

    def ensure_coach(self, cur, name: str, nationality: str | None) -> int:
        if name in self.coach:
            return self.coach[name]
        cur.execute("SELECT coach_id FROM Coach WHERE name=%s LIMIT 1", (name,))
        row = cur.fetchone()
        if not row:
            cur.execute(
                "INSERT INTO Coach (name, nationality) VALUES (%s,%s) RETURNING coach_id",
                (name, nationality)
            )
            row = cur.fetchone()
        self.coach[name] = row[0]
        return row[0]


def _sb_find_or_create_match(cur, cache: _SBCache, sb_match: dict, comp: dict
                             ) -> tuple[int | None, bool, int | None, str | None]:
    # find matching db match row or create it and cache the final score
    date_str = sb_match.get("match_date")
    if not date_str:
        return None, False, None, None

    home_sb = sb_match["home_team"]["home_team_name"]
    away_sb = sb_match["away_team"]["away_team_name"]
    country = comp.get("country_name") or None

    comp_name = comp.get("competition_name", "Unknown Competition")
    division_id = cache.ensure_division(cur, comp_name[:100], country)
    season_id = cache.ensure_season(cur, division_id, comp.get("season_name", "Unknown")[:20])

    home_id = cache.teams.resolve(home_sb, country=country, cur=cur, create_if_missing=True)
    away_id = cache.teams.resolve(away_sb, country=country, cur=cur, create_if_missing=True)

    if not home_id or not away_id or home_id == away_id:
        if home_id and away_id and home_id == away_id:
            print(f"    [skip] same team resolved for both sides (id={home_id})")
        return None, False, None, None

    sb_stadium = sb_match.get("stadium")
    stadium_name = (sb_stadium.get("name", "").strip() if sb_stadium else "") \
                   or f"Unknown home stadium - {home_sb}"
    stadium_id = cache.ensure_stadium(cur, stadium_name[:100], country)

    cur.execute(
        'SELECT match_id FROM "match" '
        'WHERE home_team_id=%s AND away_team_id=%s AND match_date=%s LIMIT 1',
        (home_id, away_id, date_str)
    )
    row = cur.fetchone()
    if row:
        match_id = row[0]
        was_created = False
    else:
        cur.execute(
            'INSERT INTO "match" (season_id, home_team_id, away_team_id, stadium_id, match_date) '
            'VALUES (%s,%s,%s,%s,%s) RETURNING match_id',
            (season_id, home_id, away_id, stadium_id, date_str)
        )
        match_id = cur.fetchone()[0]
        was_created = True

    # cache the final score for the football-data odds generator.
    home_score = int(sb_match.get("home_score", 0) or 0)
    away_score = int(sb_match.get("away_score", 0) or 0)
    cache.match_scores[match_id] = (home_score, away_score)

    return match_id, was_created, season_id, date_str


# replace placeholder stadium names when statsbomb later provides the real one
def _sb_enrich_stadium(cur, match_id: int, sb_match: dict):
    sb_stadium = sb_match.get("stadium")
    if not sb_stadium:
        return
    real_name = sb_stadium.get("name", "").strip()
    if not real_name:
        return
    cur.execute(
        """UPDATE Stadium s SET name=%s FROM "match" m
           WHERE m.match_id=%s AND m.stadium_id=s.stadium_id
                 AND s.name LIKE 'Unknown home stadium - %%'""",
        (real_name[:100], match_id)
    )


def _sb_enrich_referee(cur, match_id: int, sb_match: dict, cache: _SBCache,
                       country: str | None = None):
    # ensure referee exists and link them to the match
    sb_ref = sb_match.get("referee")
    if not sb_ref:
        return
    ref_name = sb_ref.get("name", "").strip()
    if not ref_name:
        return

    # treat "International" as unusable — use the
    # participating team's country instead.
    if not country or "international" in country.lower():
        home_team = sb_match.get("home_team") or {}
        away_team = sb_match.get("away_team") or {}
        country = (
                          (home_team.get("country") or {}).get("name")
                          or (away_team.get("country") or {}).get("name")
                  ) or None

    ref_id = cache.ensure_referee(cur, ref_name, country)
    if (ref_id, match_id) in cache.referee_match_pairs:
        return
    cur.execute(
        "INSERT INTO Referee_match (referee_id, match_id, role) "
        "VALUES (%s,%s,'main') ON CONFLICT DO NOTHING",
        (ref_id, match_id)
    )
    cache.referee_match_pairs.add((ref_id, match_id))


# collect coach appearances in memory for later stint building
def _sb_collect_coach_appearances(match_id: int, sb_match: dict, cache: _SBCache,
                                  season_id: int, match_date_str: str):
    match_date = date.fromisoformat(match_date_str)

    for side in ("home_team", "away_team"):
        team_data = sb_match.get(side, {})
        team_id = cache.teams.resolve(team_data.get(f"{side}_name", ""))
        if not team_id:
            continue
        for mgr in team_data.get("managers", []):
            name = (mgr.get("name", "") or "").strip()
            if not name:
                continue
            nationality = ((mgr.get("country") or {}).get("name", "") or "").strip() or None

            cache.coach_appearances.append((name, nationality, team_id, season_id, match_date))


# collapse collected coach appearances into stint rows
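# e.g. appearances on 2019-08-10, 2019-08-24 and 2020-07-01 become two stints:
# the 2019-08-24 → 2020-07-01 gap exceeds COACH_STINT_GAP_DAYS (90 days).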
def _sb_finalize_coach_assignments(cur, cache: _SBCache):
    if not cache.coach_appearances:
        print("  No coach appearances to finalise.")
        return

    # resolve / create coach rows in bulk before building the stint map.
    # appearances stored as (name, nationality, team_id, season_id, match_date)
    resolved: list[tuple[int, int, int, date]] = []
    for name, nationality, team_id, season_id, match_date in cache.coach_appearances:
        coach_id = cache.ensure_coach(cur, name, nationality)
        resolved.append((coach_id, team_id, season_id, match_date))

    # group by (coach_id, team_id) — one bucket per potential stint sequence.
    groups: dict[tuple[int, int], list[tuple[int, date]]] = {}
    for cid, tid, sid, d in resolved:
        groups.setdefault((cid, tid), []).append((sid, d))


    if groups:
        pair_cids = [cid for cid, _ in groups.keys()]
        pair_tids = [tid for _, tid in groups.keys()]
        cur.execute(
            "DELETE FROM Coach_assignment "
            "WHERE (coach_id, team_id) IN "
            "(SELECT * FROM UNNEST(%s::int[], %s::int[]))",
            (pair_cids, pair_tids)
        )

    rows: list[tuple] = []
    today = date.today()
    for (cid, tid), entries in groups.items():
        # sort by date so the linear walk below detects gaps correctly.
        entries.sort(key=lambda x: x[1])

        # split entries into stints based on gap-from-previous.
        # each stint is itself a list of (season_id, match_date) tuples.
        stints: list[list[tuple[int, date]]] = []
        for entry in entries:
            _, d = entry
            if not stints or (d - stints[-1][-1][1]).days > COACH_STINT_GAP_DAYS:
                stints.append([entry])
            else:
                stints[-1].append(entry)

        # one Coach_assignment row per stint with proper boundaries.
        for i, stint in enumerate(stints):
            start_season = stint[0][0]
            start_d = stint[0][1]
            last_d = stint[-1][1]

            if i + 1 < len(stints):
                # there's another stint: this one ends the day before it starts.
                end_d = stints[i + 1][0][1] - timedelta(days=1)
            elif (today - last_d).days <= COACH_STILL_ACTIVE_DAYS:
                # most recent stint and last match is recent → still active.
                end_d = None
            else:
                # closed historical stint; pad with a small buffer.
                end_d = last_d + timedelta(days=COACH_STINT_END_BUFFER_DAYS)

            rows.append((cid, start_season, tid, _random_coach_salary(), start_d, end_d))

    if rows:
        execute_values(
            cur,
            "INSERT INTO Coach_assignment "
            "(coach_id, season_id, team_id, assignment_value, start_date, end_date) "
            "VALUES %s",
            rows, page_size=2000,
        )
    print(f"  Coach stints inserted: {len(rows)} "
          f"(from {len(cache.coach_appearances)} per-match appearances)")


# insert players from a StatsBomb lineup file as Lineup rows
def _sb_import_lineups(cur, match_id: int, lineup_path: str, cache: _SBCache) -> int:
    lineups = load_json(lineup_path)
    inserted = 0
    for team_entry in lineups:
        team_id = cache.teams.resolve(team_entry.get("team_name", ""))
        if not team_id:
            continue
        cache.ensure_players(
            cur, [p.get("player_name", "").strip() for p in team_entry.get("lineup", [])]
        )
        lineup_rows = []
        for p in team_entry.get("lineup", []):
            player_name = p.get("player_name", "").strip()
            if not player_name or player_name not in cache.player:
                continue
            positions = p.get("positions", [])
            position = positions[0].get("position", "Unknown") if positions else "Unknown"
            shirt_no = p.get("jersey_number")
            is_starter = any(pos.get("start_reason") == "Starting XI" for pos in positions) \
                if positions else True
            if shirt_no is None:
                continue
            shirt_no = int(shirt_no)
            if shirt_no < 1:
                continue
            lineup_rows.append(
                (match_id, team_id, cache.player[player_name],
                 shirt_no, position[:50], is_starter)
            )
        if lineup_rows:
            execute_values(
                cur,
                "INSERT INTO Lineup (match_id, team_id, player_id, shirt_number, position, is_starter) "
                "VALUES %s ON CONFLICT DO NOTHING",
                lineup_rows, page_size=2000,
            )
            inserted += len(lineup_rows)
    return inserted


# StatsBomb event-attribute keys we deliberately drop when flattening
_SB_SKIP_ATTR_KEYS = {
    "id", "index", "period", "timestamp", "minute", "second",
    "type", "possession", "possession_team", "play_pattern",
    "team", "player", "position", "location", "match_id",
    "related_events", "off_camera", "out",
}

# attributes dropped entirely, to keep the Event_attribute table below ~30M rows
_SB_SKIP_FULL_KEYS = {"duration", "pass.angle"}


# flatten a nested event payload into string key/value attributes
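# e.g. {"pass": {"length": 15.2, "height": {"name": "Ground Pass"}}}
# flattens to {"pass.length": "15.2", "pass.height": "Ground Pass"}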
def _sb_flatten_attrs(event: dict, prefix: str = "") -> dict[str, str]:
    result = {}
    for k, v in event.items():
        full_key = f"{prefix}{k}" if prefix else k
        if not prefix and k in _SB_SKIP_ATTR_KEYS:
            continue
        if full_key in _SB_SKIP_FULL_KEYS:
            continue
        if isinstance(v, dict):
            if "name" in v:
                result[full_key] = str(v["name"])
            else:
                result.update(_sb_flatten_attrs(v, prefix=full_key + "."))
        elif isinstance(v, list):
            result[full_key] = json.dumps(v)
        elif v is not None:
            result[full_key] = str(v)
    return result


# build lineup rows from starting xi events when lineup files are missing
def _sb_lineup_from_starting_xi(ev: dict, match_id: int, cache: _SBCache) -> list[tuple]:
    try:
        team_id = cache.teams.resolve(ev.get("team", {}).get("name", ""))
        if not team_id:
            return []
        sb_lineup = ev.get("tactics", {}).get("lineup", [])
        if not sb_lineup:
            return []
        rows = []
        for entry in sb_lineup:
            try:
                player_name = entry["player"]["name"].strip()
                shirt_no = int(entry["jersey_number"])
                position = (entry.get("position") or {}).get("name") or "Unknown"
            except (KeyError, TypeError, ValueError):
                continue
            if not player_name or shirt_no < 1:
                continue
            player_id = cache.player.get(player_name)
            if not player_id:
                continue
            rows.append((match_id, team_id, player_id, shirt_no, position[:50], True))
        return rows
    except Exception:
        return []


# insert non-starting-xi events and return counts for events and xi rows
def _sb_import_events(cur, match_id: int, events_path: str, cache: _SBCache
                      ) -> tuple[int, int]:
    events = load_json(events_path)

    # pre-create any new players (regular event participants + Starting XI).
    cache.ensure_players(cur, [ev.get("player", {}).get("name", "") for ev in events])
    cache.ensure_players(cur, [
        entry["player"]["name"].strip()
        for ev in events
        if ev.get("type", {}).get("name") == "Starting XI"
        for entry in ev.get("tactics", {}).get("lineup", [])
        if isinstance(entry.get("player"), dict) and entry["player"].get("name", "").strip()
    ])
    cache.ensure_event_types(cur, [ev.get("type", {}).get("name", "") for ev in events])

    event_rows: list[tuple] = []
    attr_sets: list[dict] = []
    xi_lineup_rows: list[tuple] = []

    for ev in events:
        ev_type_name = ev.get("type", {}).get("name", "").strip()
        if not ev_type_name:
            continue
        # starting XI events synthesise lineup rows rather than event rows.
        if ev_type_name == "Starting XI":
            xi_lineup_rows.extend(_sb_lineup_from_starting_xi(ev, match_id, cache))
            continue
        if ev_type_name not in cache.event_type:
            continue
        team_id = cache.teams.resolve(ev.get("team", {}).get("name", ""))
        if not team_id:
            continue
        player_name = ev.get("player", {}).get("name", "")
        if not player_name or player_name not in cache.player:
            continue
        event_rows.append((
            match_id, team_id, cache.player[player_name], cache.event_type[ev_type_name],
            max(0, min(130, int(ev.get("minute", 0)))),
            max(0, min(59, int(ev.get("second", 0)))),
        ))
        attr_sets.append(_sb_flatten_attrs(ev))

    n_xi = 0
    if xi_lineup_rows:
        execute_values(
            cur,
            "INSERT INTO Lineup (match_id, team_id, player_id, shirt_number, position, is_starter) "
            "VALUES %s ON CONFLICT DO NOTHING",
            xi_lineup_rows, page_size=2000,
        )
        n_xi = len(xi_lineup_rows)

    if not event_rows:
        return 0, n_xi

    inserted_rows = execute_values(
        cur,
        "INSERT INTO Event (match_id, team_id, player_id, event_type_id, minute, second) "
        "VALUES %s RETURNING event_id",
        event_rows, fetch=True, page_size=2000,
    )
    inserted_ids = [r[0] for r in inserted_rows]

    attr_rows = [
        (eid, name[:50], val)
        for eid, attrs in zip(inserted_ids, attr_sets)
        for name, val in attrs.items()
    ]
    if attr_rows:
        execute_values(
            cur,
            "INSERT INTO Event_attribute (event_id, attribute_name, attribute_value) "
            "VALUES %s ON CONFLICT DO NOTHING",
            attr_rows, page_size=2000,
        )

    return len(inserted_ids), n_xi


# statsbomb - returns {match_id → (home, away) goals} (the football-data odds generator needs it)
def run_statsbomb(conn) -> dict[int, tuple[int, int]]:
    print("\n" + "═" * 70)
    print("PIPELINE 1 — STATSBOMB")
    print("═" * 70)

    competitions = load_json(SB_COMPETITIONS)
    print(f"  Found {len(competitions)} competition/season entries")

    cur = conn.cursor()

    # A fresh TeamResolver is built again in the football-data pipeline, so it
    # picks up everything we insert here (including any aliases we flush at
    # the end).
    team_resolver = TeamResolver(cur)
    cache = _SBCache(cur, team_resolver)

    print(f"  Cache loaded: {len(cache.teams._by_name)} teams, "
          f"{len(cache.player)} players, {len(cache.event_type)} event types\n")

    total_linked = total_created = total_skipped = 0
    total_players = total_xi = total_events = 0
    matches_since_commit = 0

    for comp in competitions:
        comp_id = comp["competition_id"]
        season_id = comp["season_id"]
        matches_file = os.path.join(SB_MATCHES_DIR, str(comp_id), f"{season_id}.json")
        if not os.path.exists(matches_file):
            continue

        # load matches
        matches = load_json(matches_file)
        print(f"\n  -- {comp['competition_name']} / {comp['season_name']} "
              f"({len(matches)} matches) --")

        for i, sb_match in enumerate(matches, 1):
            sb_mid = sb_match["match_id"]
            home_name = sb_match["home_team"]["home_team_name"]
            away_name = sb_match["away_team"]["away_team_name"]
            print(f"    [{i}/{len(matches)}] {home_name} vs {away_name} "
                  f"({sb_match.get('match_date', '?')})", end=" ... ", flush=True)

            db_mid, was_created, match_season_id, date_str = _sb_find_or_create_match(cur, cache, sb_match, comp)
            if db_mid is None:
                print("skipped")
                total_skipped += 1
                continue
            if was_created:
                total_created += 1
                print(f"\n    [new match] id={db_mid}", end="")

            _sb_enrich_stadium(cur, db_mid, sb_match)
            # pass the competition's country so newly created Referee rows get
            # a non-NULL country; comp is already in scope from the outer loop.
            _sb_enrich_referee(cur, db_mid, sb_match, cache,
                               comp.get("country_name") or None)

            _sb_collect_coach_appearances(db_mid, sb_match, cache,
                                          match_season_id, date_str)

            n_players = 0
            lineup_path = os.path.join(SB_LINEUPS_DIR, f"{sb_mid}.json")
            if os.path.exists(lineup_path):
                n_players = _sb_import_lineups(cur, db_mid, lineup_path, cache)

            n_events = n_xi = 0
            events_path = os.path.join(SB_EVENTS_DIR, f"{sb_mid}.json")
            if os.path.exists(events_path):
                n_events, n_xi = _sb_import_events(cur, db_mid, events_path, cache)

            cache.teams.flush_aliases(cur)

            total_linked += 1
            total_players += n_players
            total_xi += n_xi
            total_events += n_events
            matches_since_commit += 1
            print(f"done ({n_players} lineup, {n_xi} XI, {n_events} events)")

            if matches_since_commit >= SB_COMMIT_EVERY:
                conn.commit()
                matches_since_commit = 0

    # finalize coach assignments
    print("\n  Finalising coach assignments (collapsing match appearances "
          "into stints)...")
    _sb_finalize_coach_assignments(cur, cache)

    conn.commit()
    cur.close()

    print("\n  -- StatsBomb Summary --")
    print(f"  Matches linked (pre-existing) : {total_linked - total_created}")
    print(f"  Matches created from SB data  : {total_created}")
    print(f"  Matches skipped               : {total_skipped}")
    print(f"  Lineup rows (lineup files)    : {total_players}")
    print(f"  Lineup rows (Starting XI)     : {total_xi}")
    print(f"  Events inserted               : {total_events}")
    print(f"  Final scores cached           : {len(cache.match_scores)}")

    return cache.match_scores


# football-data


# strip BOMs, NBSPs, stray quotes and whitespace, then lowercase the header
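# e.g. '\ufeffB365H ' → 'b365h'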
def _fd_normalize_header(name: str) -> str:
    name = name.replace("\ufeff", "").replace("\xa0", " ")
    return re.sub(r"\s+", "", name.strip().strip('"')).lower()


def _fd_parse_date(value):
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return None
    v = str(value).strip()
    if not v:
        return None
    for fmt in ("%d/%m/%Y", "%d/%m/%y"):
        try:
            return pd.to_datetime(v, format=fmt).date()
        except ValueError:
            continue
    return None


# extract bookmaker column prefixes that have a complete h/d/a odds triple
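# e.g. ['b365h', 'b365d', 'b365a', 'hthg'] → ['b365']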
def _fd_bookmaker_prefixes(columns: list[str]) -> list[str]:
    col_set = set(columns)
    prefixes = []
    for col in columns:
        if re.match(r"^[a-z0-9]+h$", col) and len(col) > 1:
            prefix = col[:-1]
            if prefix and (prefix + "d") in col_set and (prefix + "a") in col_set:
                prefixes.append(prefix)
    return sorted(set(prefixes))


def _fd_read_csv(path: str) -> pd.DataFrame:
    raw = None
    used_encoding = None
    for enc in ("utf-8-sig", "cp1252"):
        try:
            with open(path, encoding=enc, newline="") as f:
                raw = f.read()
            used_encoding = enc
            break
        except UnicodeDecodeError:
            continue
    if raw is None:
        raise ValueError(f"Could not decode {path} with UTF-8 or Windows-1252")
    print(f"    encoding: {used_encoding}")
    raw = raw.replace("\r", "").replace("\xa0", " ")
    lines = [l for l in raw.split("\n") if l.strip()]
    if not lines:
        return pd.DataFrame()
    header_fields = next(csv.reader([lines[0]]))
    num_cols = len(header_fields)
    rows = []
    for line in lines[1:]:
        fields = next(csv.reader([line]))
        # pad short rows with None / truncate long rows so the result is rectangular
        if len(fields) < num_cols:
            fields += [None] * (num_cols - len(fields))
        elif len(fields) > num_cols:
            fields = fields[:num_cols]
        rows.append(fields)
    return pd.DataFrame(rows, columns=header_fields)


def _fd_clean_str(value) -> str | None:
    if value is None:
        return None
    s = str(value).strip()
    return None if s in ("", "nan", "None") else s


# parse one football-data CSV into per-match dicts (teams, date, referee, odds)
def _fd_process_file(path: str) -> list[dict]:
    df = _fd_read_csv(path)
    if df.empty:
        print("    Empty file, skipping.")
        return []

    df.columns = [_fd_normalize_header(c) for c in df.columns]
    required = {"date", "hometeam", "awayteam"}
    if required - set(df.columns):
        print(f"    Missing columns {required - set(df.columns)}, skipping.")
        return []

    bookmaker_prefixes = _fd_bookmaker_prefixes(list(df.columns))
    print(f"    bookmaker triples found: {len(bookmaker_prefixes)}")

    out: list[dict] = []
    for _, row in df.iterrows():
        match_date = _fd_parse_date(row.get("date"))
        home_team = _fd_clean_str(row.get("hometeam"))
        away_team = _fd_clean_str(row.get("awayteam"))
        referee = _fd_clean_str(row.get("referee"))

        if not match_date or not home_team or not away_team or home_team == away_team:
            continue

        # pull every bookmaker that has a complete H/D/A triple for this row
        odds = []
        for prefix in bookmaker_prefixes:
            try:
                home_win = float(row.get(prefix + "h") or 0)
                draw = float(row.get(prefix + "d") or 0)
                away_win = float(row.get(prefix + "a") or 0)
            except (ValueError, TypeError):
                continue
            if home_win > 0 and draw > 0 and away_win > 0:
                odds.append({
                    "bookmaker": prefix.upper(),
                    "home_win": home_win,
                    "draw": draw,
                    "away_win": away_win,
                })

        out.append({
            "home": home_team,
            "away": away_team,
            "date": match_date,
            "referee": referee,
            "odds": odds,
        })

    return out


# generate a synthetic odds series for one match, from kick-off to full time
def _fd_generate_timeseries_odds(match_id: int,
                                 match_date: date,
                                 baseline: tuple[float, float, float],
                                 score: tuple[int, int]) -> list[tuple]:
    # build synthetic odds snapshots at 0,10,...,90 minutes from baseline and final score
    p_home, p_draw, p_away = baseline
    home_goals, away_goals = score
    rows = []

    base_ts = datetime.combine(match_date, time(0, 0, 0))

    for minute in range(0, 91, 10):
        t_w = minute / 90  # 0 at kick-off, 1 at full time
        h_so_far = math.floor(home_goals * t_w)
        a_so_far = math.floor(away_goals * t_w)
        diff = h_so_far - a_so_far  # +ve when home leads

        # Home win prob rises when home is leading; effect grows with time.
        adj_h = max(0.02, min(0.97, p_home + diff * 0.08 * t_w))
        # Draw prob ↓ as |goal_diff| grows and as time runs out.
        adj_d = max(0.02, min(0.97, p_draw - abs(diff) * 0.06 * t_w))
        # Away win prob mirrors home in reverse.
        adj_a = max(0.02, min(0.97, p_away + (-diff) * 0.08 * t_w))
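        # e.g. baseline (0.50, 0.25, 0.25) with a 2-0 final, at minute 90:
        # diff=2, so adj_h=0.66, adj_d=0.13, adj_a=0.09 before renormalising.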

        total = adj_h + adj_d + adj_a
        # Renormalise so the three probs again sum to 1, then convert to
        # decimal odds with a ~5% bookmaker margin (the 0.95 factor).
        home_odds = max(1.01, round(1.0 / (adj_h / total) * 0.95, 2))
        draw_odds = max(1.01, round(1.0 / (adj_d / total) * 0.95, 2))
        away_odds = max(1.01, round(1.0 / (adj_a / total) * 0.95, 2))

        rt = base_ts + timedelta(minutes=minute)
        rows.append((match_id, "Generated", home_odds, draw_odds, away_odds, rt))

    return rows


# import football-data odds and referees then generate synthetic odds series
def run_football_data_odds(conn, match_scores: dict[int, tuple[int, int]]):
    print("\n" + "═" * 70)
    print("PIPELINE 2 — FOOTBALL-DATA (odds + referees only)")
    print("═" * 70)

    csv_files = sorted(f for f in os.listdir(FD_FOLDER) if f.lower().endswith(".csv"))
    if not csv_files:
        print(f"  No CSV files found in {FD_FOLDER}, skipping.")
        return

    cur = conn.cursor()

    team_resolver = TeamResolver(cur, manual_aliases=FD_TEAM_ALIASES)

    # build the (home_id, away_id, match_date) to match_id map once.
    cur.execute('SELECT match_id, home_team_id, away_team_id, match_date FROM "match"')
    match_lookup: dict[tuple[int, int, date], int] = {
        (h, a, d): m for m, h, a, d in cur.fetchall()
    }
    print(f"  Loaded {len(match_lookup)} existing matches for lookup.")

    # referee caches so we don't query the DB once per row.
    cur.execute("SELECT referee_id, name FROM Referee")
    ref_cache: dict[str, int] = {n: i for i, n in cur.fetchall()}
    cur.execute("SELECT referee_id, match_id FROM Referee_match")
    ref_match_pairs: set[tuple[int, int]] = {(r, m) for r, m in cur.fetchall()}

    # for a referee's country we use the home team's country
    cur.execute("SELECT team_id, country FROM Team")
    team_country_map: dict[int, str | None] = {tid: ctry for tid, ctry in cur.fetchall()}

    real_odds_rows: list[tuple] = []
    referee_rows: list[tuple] = []
    matched_rows = unmatched_rows = 0
    matched_match_ids: set[int] = set()

    print(f"  Processing {len(csv_files)} CSV file(s)...")
    for filename in csv_files:
        path = os.path.join(FD_FOLDER, filename)
        print(f"  Reading: {filename}")
        for r in _fd_process_file(path):
            home_id = team_resolver.resolve(r["home"])
            away_id = team_resolver.resolve(r["away"])
            if not home_id or not away_id:
                unmatched_rows += 1
                continue
            mid = match_lookup.get((home_id, away_id, r["date"]))
            if mid is None:
                unmatched_rows += 1
                continue

            matched_rows += 1
            matched_match_ids.add(mid)

            # one Match_odds row per bookmaker.
            kickoff_ts = datetime.combine(r["date"], time(12, 0, 0))
            for o in r["odds"]:
                real_odds_rows.append((
                    mid, o["bookmaker"], o["home_win"], o["draw"], o["away_win"],
                    kickoff_ts,
                ))

            # referee linking: ensure the row exists in Referee then attach
            # it to this match if not already attached.
            if r["referee"]:
                ref_id = ref_cache.get(r["referee"])
                if ref_id is None:
                    # use the home team's country as the referee's country
                    # (fall back to the away team if home has no country set).
                    ref_country = (team_country_map.get(home_id)
                                   or team_country_map.get(away_id))
                    cur.execute(
                        "INSERT INTO Referee (name, country, date_of_birth) "
                        "VALUES (%s,%s,%s) RETURNING referee_id",
                        (r["referee"], ref_country, _random_referee_dob())
                    )
                    ref_id = cur.fetchone()[0]
                    ref_cache[r["referee"]] = ref_id
                if (ref_id, mid) not in ref_match_pairs:
                    referee_rows.append((ref_id, mid, "main"))
                    ref_match_pairs.add((ref_id, mid))

    print(f"\n  Matched FD rows to SB matches : {matched_rows}")
    print(f"  Unmatched (no team or no SB match) : {unmatched_rows}")

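    # wipe previous synthetic rows so the 'Generated' series is rebuilt from scratch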
    cur.execute("DELETE FROM Match_odds WHERE bookmaker = 'Generated'")

    # real bookmaker rows
    if real_odds_rows:
        execute_values(
            cur,
            """INSERT INTO Match_odds (match_id, bookmaker, home_win, draw, away_win, recorded_time)
               VALUES %s
               ON CONFLICT (match_id, bookmaker, recorded_time) DO UPDATE
               SET home_win=EXCLUDED.home_win,
                   draw=EXCLUDED.draw,
                   away_win=EXCLUDED.away_win""",
            real_odds_rows, page_size=2000,
        )

    if referee_rows:
        execute_values(
            cur,
            "INSERT INTO Referee_match (referee_id, match_id, role) VALUES %s "
            "ON CONFLICT DO NOTHING",
            referee_rows, page_size=2000,
        )

    conn.commit()

    # generated time-series
    # for every match that now has at least one real bookmaker row AND a
    # cached final score from StatsBomb, derive the kick-off implied
    # probability (averaged across bookmakers) and synthesise the 0..90 min
    # series.
    if not matched_match_ids:
        print("  No matched matches → no time-series odds to generate.")
        cur.close()
        return

    # pull per-match baseline implied probabilities + match_date in one query.
    cur.execute(
        """
        SELECT mo.match_id,
               AVG(1.0 / mo.home_win) AS p_home,
               AVG(1.0 / mo.draw)     AS p_draw,
               AVG(1.0 / mo.away_win) AS p_away,
               m.match_date
        FROM Match_odds mo
        JOIN "match"   m ON m.match_id = mo.match_id
        WHERE mo.bookmaker <> 'Generated'
          AND mo.match_id = ANY(%s)
        GROUP BY mo.match_id, m.match_date
        """,
        (list(matched_match_ids),)
    )
    baselines = cur.fetchall()

    generated: list[tuple] = []
    skipped_no_score = 0
    for mid, p_h, p_d, p_a, mdate in baselines:
        score = match_scores.get(mid)
        if score is None:
            skipped_no_score += 1
            continue
        generated.extend(_fd_generate_timeseries_odds(
            mid, mdate, (float(p_h), float(p_d), float(p_a)), score
        ))

    if generated:
        execute_values(
            cur,
            """INSERT INTO Match_odds (match_id, bookmaker, home_win, draw, away_win, recorded_time)
               VALUES %s
               ON CONFLICT (match_id, bookmaker, recorded_time) DO NOTHING""",
            generated, page_size=2000,
        )

    conn.commit()
    cur.close()

    print(f"  Real bookmaker rows inserted    : {len(real_odds_rows)}")
    print(f"  Referee links inserted          : {len(referee_rows)}")
    print(f"  Generated time-series rows      : {len(generated)} "
          f"({len(generated) // 10 if generated else 0} matches × 10 snapshots)")
    if skipped_no_score:
        print(f"  Skipped (no SB final score)     : {skipped_no_score}")


# transfermarkt

# resolves transfermarkt player name to DB player_id
# order: exact name -> word match (split on space/hyphen, skip ≤ 2 chars) -> fuzzy
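# e.g. TM's "Neymar da Silva Santos Júnior" can word-match a DB player stored
# simply as "Neymar".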
class _PlayerResolver:
    def __init__(self, cur):
        self._by_name: dict[str, int] = {}
        self._cache: dict[str, int | None] = {}
        self._fuzzy_list: list[tuple[str, int]] = []

        cur.execute("SELECT player_id, name FROM Player")
        for pid, name in cur.fetchall():
            self._by_name[name.lower()] = pid
            self._fuzzy_list.append((name, pid))

        print(f"  PlayerResolver: {len(self._by_name)} players loaded from DB")

    def resolve(self, tm_name: str) -> int | None:
        if not tm_name:
            return None
        if tm_name in self._cache:
            return self._cache[tm_name]
        key = tm_name.lower().strip()

        pid = self._by_name.get(key)
        if pid:
            self._cache[tm_name] = pid
            return pid

        for word in re.split(r"[\s\-]+", tm_name):
            if len(word) <= 2:
                continue
            pid = self._by_name.get(word.lower())
            if pid:
                print(f"    [word] '{tm_name}' → '{word}' (id={pid})")
                self._cache[tm_name] = pid
                return pid

        if self._fuzzy_list:
            result = fuzz_process.extractOne(
                tm_name,
                [n for n, _ in self._fuzzy_list],
                scorer=fuzz.token_sort_ratio
            )
            if result and result[1] >= PLAYER_FUZZY_THRESHOLD:
                best_name, best_score = result[0], result[1]
                pid = next(p for n, p in self._fuzzy_list if n == best_name)
                print(f"    [fuzzy] '{tm_name}' → '{best_name}' (score={best_score})")
                self._cache[tm_name] = pid
                return pid

        self._cache[tm_name] = None
        return None


# map messy transfer_fee text onto canonical transfer type labels
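# e.g. "0" → "Free Transfer", "loan fee" → "Loan", "end of loan" → "End of Loan"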
def _tm_classify_transfer_type(row) -> str:
    fee = row.get("transfer_fee", None)
    fee_str = str(fee).strip().lower() if pd.notna(fee) else ""
    if fee_str in ("", "nan", "none", "-", "?"):
        return "Unknown"
    if fee_str in ("0", "0.0"):
        return "Free Transfer"
    if "loan" in fee_str and "end" in fee_str:
        return "End of Loan"
    if "loan" in fee_str:
        return "Loan"
    return "Permanent"


def _tm_get_or_create_transfer_type(cur, cache: dict[str, int], type_name: str) -> int:
    if type_name in cache:
        return cache[type_name]
    cur.execute("SELECT transfer_type_id FROM Transfer_type WHERE type=%s LIMIT 1", (type_name,))
    row = cur.fetchone()
    if row:
        cache[type_name] = row[0]
    else:
        cur.execute(
            "INSERT INTO Transfer_type (type) VALUES (%s) RETURNING transfer_type_id",
            (type_name,)
        )
        cache[type_name] = cur.fetchone()[0]
    return cache[type_name]


def _tm_parse_fee(raw) -> float | None:
    if raw is None or (isinstance(raw, float) and pd.isna(raw)):
        return None
    try:
        return max(float(raw), 0.0)
    except (ValueError, TypeError):
        return None


_season_resolve_cache: dict[str, int | None] = {}


def _tm_resolve_season(cur, season_str: str) -> int | None:
    if season_str in _season_resolve_cache:
        return _season_resolve_cache[season_str]
    candidates = [season_str]
    if re.match(r"^\d{4}$", season_str):
        y = int(season_str)
        candidates += [f"{y - 1}/{str(y)[2:]}", f"{y}/{y + 1}", f"{str(y)[2:]}/{str(y + 1)[2:]}"]
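        # e.g. "2015" yields candidates ["2015", "2014/15", "2015/2016", "15/16"]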
    m = re.match(r"^(\d{4})/(\d{4})$", season_str)
    if m:
        y1, y2 = m.group(1), m.group(2)
        candidates += [f"{y1}/{y2[2:]}", f"{y1[2:]}/{y2[2:]}"]
    for c in candidates:
        cur.execute("SELECT season_id FROM Season WHERE season_name=%s LIMIT 1", (c,))
        row = cur.fetchone()
        if row:
            _season_resolve_cache[season_str] = row[0]
            return row[0]
    _season_resolve_cache[season_str] = None
    return None


def _tm_build_player_map(players_df: pd.DataFrame, resolver: _PlayerResolver
                         ) -> dict[int, int]:
    mapping: dict[int, int] = {}
    matched = 0
    for _, row in players_df.iterrows():
        tm_id = int(row["player_id"])
        tm_name = str(row.get("name", "")).strip()
        db_id = resolver.resolve(tm_name)
        if db_id:
            mapping[tm_id] = db_id
            matched += 1
    print(f"  Player mapping: {matched}/{len(players_df)} TM players matched")
    return mapping


def _tm_import_valuations(cur, df: pd.DataFrame, player_map: dict[int, int]) -> int:
    rows = []
    skipped_player = skipped_season = skipped_fee = 0
    for _, row in df.iterrows():
        db_pid = player_map.get(int(row["player_id"]))
        if not db_pid:
            skipped_player += 1
            continue
        raw_date = row.get("date")
        if pd.isna(raw_date) or not raw_date:
            skipped_season += 1
            continue
        val_date = pd.to_datetime(raw_date, errors="coerce")
        if pd.isna(val_date):
            skipped_season += 1
            continue
        mv = _tm_parse_fee(row.get("market_value_in_eur"))
        if mv is None:
            skipped_fee += 1
            continue
        year = val_date.year
        season_id = _tm_resolve_season(cur, f"{year - 1}/{str(year)[2:]}") or \
                    _tm_resolve_season(cur, str(year))
        if not season_id:
            skipped_season += 1
            continue
        rows.append((db_pid, season_id, mv, val_date.date()))

    if rows:
        execute_values(
            cur,
            "INSERT INTO Player_valuation (player_id, season_id, market_value, valuation_date) "
            "VALUES %s ON CONFLICT DO NOTHING",
            rows, page_size=2000,
        )
    print(f"  Valuations: {len(rows)} inserted | "
          f"{skipped_player} no player | {skipped_season} no season | {skipped_fee} no value")
    return len(rows)


def _tm_import_transfers(cur, df: pd.DataFrame, player_map: dict[int, int],
                         team_resolver: TeamResolver) -> int:
    type_cache: dict[str, int] = {}
    rows = []
    skipped_player = skipped_team = skipped_date = skipped_same = 0

    for _, row in df.iterrows():
        db_pid = player_map.get(int(row["player_id"]))
        if not db_pid:
            skipped_player += 1
            continue
        raw_date = row.get("transfer_date")
        if pd.isna(raw_date) or not raw_date:
            skipped_date += 1
            continue
        transfer_date = pd.to_datetime(raw_date, errors="coerce")
        if pd.isna(transfer_date):
            skipped_date += 1
            continue

        from_id = team_resolver.resolve(str(row.get("from_club_name", "") or "").strip())
        to_id = team_resolver.resolve(str(row.get("to_club_name", "") or "").strip())
        if not from_id or not to_id:
            skipped_team += 1
            continue
        if from_id == to_id:
            skipped_same += 1
            continue

        fee = _tm_parse_fee(row.get("transfer_fee")) or 0.0
        type_id = _tm_get_or_create_transfer_type(
            cur, type_cache, _tm_classify_transfer_type(row)
        )
        rows.append((db_pid, from_id, to_id, transfer_date.date(), fee, type_id))

    if rows:
        execute_values(
            cur,
            "INSERT INTO Transfer "
            "(player_id, from_team_id, to_team_id, transfer_date, fee, transfer_type_id) "
            "VALUES %s ON CONFLICT DO NOTHING",
            rows, page_size=2000,
        )
    print(f"  Transfers: {len(rows)} inserted | "
          f"{skipped_player} no player | {skipped_team} no team | "
          f"{skipped_date} bad date | {skipped_same} same from/to")
    return len(rows)


def run_transfermarkt(conn):
    print("\n" + "═" * 70)
    print("PIPELINE 3 — TRANSFERMARKT")
    print("═" * 70)

    print("  Loading CSVs...")
    players_df = pd.read_csv(TM_PLAYERS, low_memory=False)
    valuations_df = pd.read_csv(TM_VALUATIONS, low_memory=False)
    transfers_df = pd.read_csv(TM_TRANSFERS, low_memory=False)
    print(f"    players: {len(players_df):,} | "
          f"valuations: {len(valuations_df):,} | transfers: {len(transfers_df):,}")

    cur = conn.cursor()
    resolver = _PlayerResolver(cur)
    team_resolver = TeamResolver(cur)
    player_map = _tm_build_player_map(players_df, resolver)

    print("  Importing valuations...")
    _tm_import_valuations(cur, valuations_df, player_map)
    conn.commit()

    print("  Importing transfers...")
    _tm_import_transfers(cur, transfers_df, player_map, team_resolver)
    conn.commit()

    cur.close()


def main():
    print("PitchMap Master Importer")
    print("Pipelines: statsbomb → football-data (odds) → transfermarkt\n")

    conn = psycopg2.connect(**DB_CONFIG)
    # run with synchronous_commit off so commits don't wait for the WAL flush
    # (fine for a re-runnable bulk import)
    cur = conn.cursor()
    cur.execute("SET synchronous_commit = off")
    conn.commit()
    cur.close()
    try:
        match_scores = run_statsbomb(conn)

        run_football_data_odds(conn, match_scores)

        run_transfermarkt(conn)
    finally:
        conn.close()

    print("\n" + "═" * 70)
    print("All pipelines complete.")
    print("═" * 70)


if __name__ == "__main__":
    main()
