DatabaseCreation: seeding_script.py

File seeding_script.py, 55.5 KB (added by 231091, 5 hours ago)
Line 
1"""
21. StatsBomb — single source of truth for matches, lineups, events, coaches, stadiums, and players.
3
4 - Players carry a randomly generated date_of_birth
5 - Stadiums get a randomly generated capacity and year_built
6 - Coach appearances are *collected* during the import
7 (one tuple per match per coach) and then collapsed
8 into proper stints with realistic start/end dates
9 and a generated salary in a single finalisation
10 step.
11
122. Football-data — bookmaker odds and referee assignments for the StatsBomb matches that are already in the DB.
13
14 - A 'Generated' time-series of odds is produced
15 for every match at minutes 0, 10, 20, ..., 90 using
16 the StatsBomb final score.
17
183. Transfermarkt — player valuations and transfers (unchanged).
19
20"""
21
22import csv
23import json
24import math
25import os
26import random
27import re
28from datetime import date, datetime, time, timedelta
29
30import pandas as pd
31import psycopg2
32from psycopg2.extras import execute_values
33from rapidfuzz import process as fuzz_process, fuzz
34
# PostgreSQL connection parameters (placeholders — filled in per deployment).
DB_CONFIG = {
    "host": "<host>",
    "port": "<port>",
    "dbname": "<dbname>",
    "user": "<user>",
    "password": "<password>",
}

# football-data.co.uk CSV folder (one CSV per division/season)
FD_FOLDER = "<fd_root>"

# Transfermarkt Kaggle dataset folder
TM_ROOT = "<transfermarkt_root>"
TM_PLAYERS = f"{TM_ROOT}/players.csv"
TM_VALUATIONS = f"{TM_ROOT}/player_valuations.csv"
TM_TRANSFERS = f"{TM_ROOT}/transfers.csv"

# StatsBomb open-data repo folder
SB_ROOT = "<statsbomb_root>"
SB_COMPETITIONS = os.path.join(SB_ROOT, "data", "competitions.json")
SB_MATCHES_DIR = os.path.join(SB_ROOT, "data", "matches")
SB_LINEUPS_DIR = os.path.join(SB_ROOT, "data", "lineups")
SB_EVENTS_DIR = os.path.join(SB_ROOT, "data", "events")

# football-data team spellings -> canonical team names as stored in the DB.
# Applied by TeamResolver before word/fuzzy matching.
FD_TEAM_ALIASES: dict[str, str] = {
    "Hamburg": "Hamburger SV",
    "M'gladbach": "Borussia Mönchengladbach",
    "West Ham": "West Ham United",
    "Wolves": "Wolverhampton Wanderers",
    "Man City": "Manchester City",
    "West Brom": "West Bromwich Albion",
    "Man United": "Manchester United",
    "Paris SG": "Paris Saint-Germain",
}

# Minimum rapidfuzz scores (0-100) to accept a fuzzy match.
TEAM_FUZZY_THRESHOLD = 80  # team name matching
PLAYER_FUZZY_THRESHOLD = 90  # stricter — player names collide more easily

# StatsBomb matches to process between DB commits.
SB_COMMIT_EVERY = 500
75
76PLAYER_DOB_START = date(1975, 1, 1)
77PLAYER_DOB_END = date(2008, 12, 31)
78
79STADIUM_CAPACITY_MIN = 8_000
80STADIUM_CAPACITY_MAX = 90_000
81
82STADIUM_YEAR_MIN = 1900
83STADIUM_YEAR_MAX = 2015
84
85COACH_SALARY_MIN = 500_000
86COACH_SALARY_MAX = 10_000_000
87
88COACH_STILL_ACTIVE_DAYS = 180 # 6 months
89
90COACH_STINT_GAP_DAYS = 90
91
92COACH_STINT_END_BUFFER_DAYS = 30
93
94REFEREE_DOB_START = date(1955, 1, 1)
95REFEREE_DOB_END = date(1985, 12, 31)
96
97
98# pick a random player date of birth within configured bounds
99def _random_player_dob() -> date:
100 span = (PLAYER_DOB_END - PLAYER_DOB_START).days
101 return PLAYER_DOB_START + timedelta(days=random.randint(0, span))
102
103
104# pick a stadium capacity in the configured range
105def _random_stadium_capacity() -> int:
106 return random.randint(STADIUM_CAPACITY_MIN, STADIUM_CAPACITY_MAX)
107
108
109# pick a plausible stadium build year
110def _random_stadium_year() -> int:
111 return random.randint(STADIUM_YEAR_MIN, STADIUM_YEAR_MAX)
112
113
114# pick a random coach salary and round to nearest 1000
115def _random_coach_salary() -> int:
116 return round(random.randint(COACH_SALARY_MIN, COACH_SALARY_MAX), -3)
117
118
119# pick a random referee date of birth within configured bounds
120def _random_referee_dob() -> date:
121 span = (REFEREE_DOB_END - REFEREE_DOB_START).days
122 return REFEREE_DOB_START + timedelta(days=random.randint(0, span))
123
124
def load_json(path: str):
    """Parse and return the UTF-8 encoded JSON document at *path*."""
    with open(path, encoding="utf-8") as fp:
        text = fp.read()
    return json.loads(text)
128
129
# resolves team name to db team_id
# order: cache -> exact match -> alias -> single-word match -> fuzzy -> new team
class TeamResolver:
    """Resolve free-text team names (from any data source) to DB team_ids.

    Lookup order per call: result cache -> exact name -> alias ->
    significant-word match -> fuzzy match -> optional insert of a new Team
    row.  Word and fuzzy hits are remembered as aliases and batch-flushed
    to Team_alias via flush_aliases().
    """

    def __init__(self, cur, manual_aliases: dict[str, str] | None = None):
        # lower-cased canonical team name -> team_id
        self._by_name: dict[str, int] = {}
        # lower-cased alias -> team_id
        self._by_alias: dict[str, int] = {}
        # (original-case name, team_id) candidates for fuzzy matching
        self._fuzzy_list: list[tuple[str, int]] = []
        # raw input name -> resolved team_id (None = known-unresolvable)
        self._resolve_cache: dict[str, int | None] = {}
        # (team_id, alias) rows waiting to be inserted into Team_alias
        self._pending_aliases: list[tuple[int, str]] = []

        cur.execute("SELECT team_id, name FROM Team")
        for tid, name in cur.fetchall():
            self._by_name[name.lower()] = tid
            self._fuzzy_list.append((name, tid))

        cur.execute("SELECT team_id, alias_name FROM Team_alias")
        for tid, alias in cur.fetchall():
            self._by_alias[alias.lower()] = tid

        # manual_aliases maps a source spelling -> canonical DB name;
        # only registered when the canonical name actually exists.
        if manual_aliases:
            for src, dst in manual_aliases.items():
                tid = self._by_name.get(dst.lower())
                if tid:
                    self._by_alias[src.lower()] = tid
                else:
                    print(f" [alias warning] '{dst}' not found in Team table")

    def resolve(self, name: str, country: str | None = None,
                cur=None, create_if_missing: bool = False) -> int | None:
        """Return the team_id for *name*, or None when unresolvable.

        With create_if_missing=True and a cursor, an unmatched name is
        inserted as a brand-new Team row instead of returning None.
        Results (including failures) are memoised per raw input string.
        """
        if not name:
            return None
        if name in self._resolve_cache:
            return self._resolve_cache[name]

        key = name.lower()

        # 1. exact name
        if key in self._by_name:
            tid = self._by_name[key]
            self._resolve_cache[name] = tid
            return tid

        # 2. alias
        if key in self._by_alias:
            tid = self._by_alias[key]
            self._resolve_cache[name] = tid
            return tid

        # 3. word match (e.g. "FC Barcelona" → "Barcelona");
        # words of <= 2 chars (FC, SC, ...) are skipped as noise.
        for word in name.split():
            if len(word) <= 2:
                continue
            w = word.lower()
            tid = self._by_name.get(w) or self._by_alias.get(w)
            if tid:
                print(f" [word match] '{name}' → '{word}' (team_id={tid})")
                self._pending_aliases.append((tid, name))
                self._by_alias[key] = tid
                self._resolve_cache[name] = tid
                return tid

        # 4. fuzzy match — only against candidates on the same side of the
        # men's/women's split, so e.g. "Arsenal" never matches "Arsenal Women".
        is_womens = "women" in key
        candidates = [
            (n, t) for n, t in self._fuzzy_list
            if ("women" in n.lower()) == is_womens
        ]
        if candidates:
            result = fuzz_process.extractOne(
                name,
                [n for n, _ in candidates],
                scorer=fuzz.token_sort_ratio
            )
            if result and result[1] >= TEAM_FUZZY_THRESHOLD:
                best_name, best_score = result[0], result[1]
                tid = next(t for n, t in candidates if n == best_name)
                print(f" [fuzzy] '{name}' → '{best_name}' (score {best_score})")
                self._pending_aliases.append((tid, name))
                self._by_alias[key] = tid
                self._resolve_cache[name] = tid
                return tid

        # 5. create new
        if create_if_missing and cur is not None:
            print(f" [new team] inserting '{name}' (country={country})")
            cur.execute(
                "INSERT INTO Team (name, country) VALUES (%s, %s) RETURNING team_id",
                (name[:100], country)
            )
            tid = cur.fetchone()[0]
            self._by_name[key] = tid
            self._fuzzy_list.append((name, tid))
            self._resolve_cache[name] = tid
            return tid

        print(f" [no match] '{name}'")
        self._resolve_cache[name] = None
        return None

    def flush_aliases(self, cur):
        """Batch-insert aliases learned since the last flush (idempotent)."""
        if not self._pending_aliases:
            return
        execute_values(
            cur,
            "INSERT INTO Team_alias (team_id, alias_name) VALUES %s ON CONFLICT DO NOTHING",
            self._pending_aliases,
            page_size=2000,  # OPT-3
        )
        self._pending_aliases.clear()
239
240
241# statsbomb
242
class _SBCache:
    """In-memory caches of DB rows used throughout the StatsBomb import.

    All lookups are preloaded from the DB once in __init__; the ensure_*
    methods lazily insert missing rows and keep the caches in sync.
    """

    def __init__(self, cur, team_resolver: TeamResolver):
        self.teams = team_resolver  # shared TeamResolver instance
        self.player: dict[str, int] = {}       # player name -> player_id
        self.event_type: dict[str, int] = {}   # event type name -> event_type_id
        self.referee: dict[str, int] = {}      # referee name -> referee_id
        self.coach: dict[str, int] = {}        # coach name -> coach_id
        self.division: dict[tuple, int] = {}   # (name, country) -> division_id
        self.season: dict[tuple, int] = {}     # (division_id, season_name) -> season_id
        self.stadium: dict[tuple, int] = {}    # (name, country) -> stadium_id
        self.referee_match_pairs: set[tuple[int, int]] = set()

        # per-match coach appearances, finalised into stints at the end.
        # each entry is (coach name, nationality, team_id, season_id,
        # match_date); names are resolved to coach_ids during finalisation.
        self.coach_appearances: list[tuple[str, str | None, int, int, date]] = []

        # final scores keyed by DB match_id, used by the football-data
        # pipeline to drive the generated odds time-series.
        self.match_scores: dict[int, tuple[int, int]] = {}

        cur.execute("SELECT player_id, name FROM Player")
        for pid, name in cur.fetchall():
            self.player[name] = pid

        cur.execute("SELECT event_type_id, type FROM Event_type")
        for etid, t in cur.fetchall():
            self.event_type[t] = etid

        cur.execute("SELECT referee_id, name FROM Referee")
        for rid, name in cur.fetchall():
            self.referee[name] = rid

        cur.execute("SELECT coach_id, name FROM Coach")
        for cid, name in cur.fetchall():
            self.coach[name] = cid

        cur.execute("SELECT referee_id, match_id FROM Referee_match")
        for rid, mid in cur.fetchall():
            self.referee_match_pairs.add((rid, mid))

        cur.execute("SELECT division_id, name, country FROM Division")
        for did, name, country in cur.fetchall():
            self.division[(name, country)] = did

        cur.execute("SELECT season_id, division_id, season_name FROM Season")
        for sid, did, sname in cur.fetchall():
            self.season[(did, sname)] = sid

        cur.execute("SELECT stadium_id, name, country FROM Stadium")
        for stid, name, country in cur.fetchall():
            self.stadium[(name, country)] = stid

    def ensure_division(self, cur, name: str, country: str | None) -> int:
        """Return division_id for (name, country), inserting if missing."""
        key = (name, country)
        if key in self.division:
            return self.division[key]
        cur.execute(
            "INSERT INTO Division (name, country) VALUES (%s,%s) RETURNING division_id",
            (name, country)
        )
        did = cur.fetchone()[0]
        self.division[key] = did
        return did

    def ensure_season(self, cur, division_id: int, season_name: str) -> int:
        """Return season_id for (division_id, season_name), inserting if missing."""
        key = (division_id, season_name)
        if key in self.season:
            return self.season[key]
        # double-check the DB before inserting — the cache may be stale.
        cur.execute(
            "SELECT season_id FROM Season WHERE division_id=%s AND season_name=%s LIMIT 1",
            (division_id, season_name)
        )
        row = cur.fetchone()
        if row is None:
            cur.execute(
                "INSERT INTO Season (division_id, season_name) VALUES (%s,%s) RETURNING season_id",
                (division_id, season_name)
            )
            row = cur.fetchone()
        self.season[key] = row[0]
        return row[0]

    def ensure_stadium(self, cur, name: str, country: str | None) -> int:
        # get existing stadium or insert one with generated capacity/year when missing
        key = (name, country)
        if key in self.stadium:
            return self.stadium[key]
        # NULL-safe country comparison: country=%s is never true for NULL.
        cur.execute(
            "SELECT stadium_id FROM Stadium WHERE name=%s "
            "AND (country=%s OR (country IS NULL AND %s IS NULL)) LIMIT 1",
            (name, country, country)
        )
        row = cur.fetchone()
        if row:
            sid = row[0]
        else:
            # random reasonable values for the columns the source data lacks.
            cur.execute(
                "INSERT INTO Stadium (name, country, capacity, year_built) "
                "VALUES (%s,%s,%s,%s) RETURNING stadium_id",
                (name[:100], country, _random_stadium_capacity(), _random_stadium_year())
            )
            sid = cur.fetchone()[0]
        self.stadium[key] = sid
        return sid

    def ensure_players(self, cur, names: list[str]):
        """Batch-insert unseen players (with generated DOB) and cache their ids."""
        # de-dupe via a set comprehension; empty names are dropped.
        new = [(n, _random_player_dob())
               for n in {n for n in names if n and n not in self.player}]
        if not new:
            return
        rows = execute_values(
            cur,
            "INSERT INTO Player (name, date_of_birth) VALUES %s RETURNING player_id, name",
            new, fetch=True, page_size=500,  # OPT-3
        )
        for pid, name in rows:
            self.player[name] = pid

    def ensure_event_types(self, cur, type_names: list[str]):
        """Batch-insert unseen event types and cache their ids."""
        new = list({t for t in type_names if t and t not in self.event_type})
        if not new:
            return
        # the no-op DO UPDATE makes RETURNING yield ids for pre-existing
        # rows too, not only freshly inserted ones.
        rows = execute_values(
            cur,
            "INSERT INTO Event_type (type) VALUES %s "
            "ON CONFLICT (type) DO UPDATE SET type=EXCLUDED.type "
            "RETURNING event_type_id, type",
            [(t,) for t in new], fetch=True, page_size=2000,  # OPT-3
        )
        for etid, t in rows:
            self.event_type[t] = etid

    def ensure_referee(self, cur, name: str, country: str | None = None) -> int:
        # get existing referee or insert a new one with generated dob
        if name in self.referee:
            return self.referee[name]
        cur.execute("SELECT referee_id FROM Referee WHERE name=%s LIMIT 1", (name,))
        row = cur.fetchone()
        if not row:
            cur.execute(
                "INSERT INTO Referee (name, country, date_of_birth) "
                "VALUES (%s,%s,%s) RETURNING referee_id",
                (name, country, _random_referee_dob())
            )
            row = cur.fetchone()
        self.referee[name] = row[0]
        return row[0]

    def ensure_coach(self, cur, name: str, nationality: str | None) -> int:
        """Return coach_id for *name*, inserting a new Coach row if missing."""
        if name in self.coach:
            return self.coach[name]
        cur.execute("SELECT coach_id FROM Coach WHERE name=%s LIMIT 1", (name,))
        row = cur.fetchone()
        if not row:
            cur.execute(
                "INSERT INTO Coach (name, nationality) VALUES (%s,%s) RETURNING coach_id",
                (name, nationality)
            )
            row = cur.fetchone()
        self.coach[name] = row[0]
        return row[0]
408
409
def _sb_find_or_create_match(cur, cache: _SBCache, sb_match: dict, comp: dict
                             ) -> tuple[int | None, bool, int | None, str | None]:
    """Find the DB match row for one StatsBomb match, creating it if needed.

    Returns (match_id, was_created, season_id, match_date_str); match_id is
    None when the match must be skipped (no date, unresolvable team, or both
    sides resolving to the same team_id).

    Side effect: records the final score in cache.match_scores for the
    football-data odds generator.
    """
    date_str = sb_match.get("match_date")
    if not date_str:
        return None, False, None, None

    home_sb = sb_match["home_team"]["home_team_name"]
    away_sb = sb_match["away_team"]["away_team_name"]
    country = comp.get("country_name") or None

    comp_name = comp.get("competition_name", "Unknown Competition")
    division_id = cache.ensure_division(cur, comp_name[:100], country)
    season_id = cache.ensure_season(cur, division_id, comp.get("season_name", "Unknown")[:20])

    home_id = cache.teams.resolve(home_sb, country=country, cur=cur, create_if_missing=True)
    away_id = cache.teams.resolve(away_sb, country=country, cur=cur, create_if_missing=True)

    # skip when either side is unresolvable, or when fuzzy matching collapsed
    # both sides onto the same team (which would violate match semantics).
    if not home_id or not away_id or home_id == away_id:
        if home_id and away_id and home_id == away_id:
            print(f" [skip] same team resolved for both sides (id={home_id})")
        return None, False, None, None

    # fall back to a per-team placeholder name that _sb_enrich_stadium can
    # later overwrite when StatsBomb supplies the real stadium.
    sb_stadium = sb_match.get("stadium")
    stadium_name = (sb_stadium.get("name", "").strip() if sb_stadium else "") \
        or f"Unknown home stadium - {home_sb}"
    stadium_id = cache.ensure_stadium(cur, stadium_name[:100], country)

    # a match is identified by (home, away, date) — "match" is quoted
    # because it is a reserved-ish identifier.
    cur.execute(
        'SELECT match_id FROM "match" '
        'WHERE home_team_id=%s AND away_team_id=%s AND match_date=%s LIMIT 1',
        (home_id, away_id, date_str)
    )
    row = cur.fetchone()
    if row:
        match_id = row[0]
        was_created = False
    else:
        cur.execute(
            'INSERT INTO "match" (season_id, home_team_id, away_team_id, stadium_id, match_date) '
            'VALUES (%s,%s,%s,%s,%s) RETURNING match_id',
            (season_id, home_id, away_id, stadium_id, date_str)
        )
        match_id = cur.fetchone()[0]
        was_created = True

    # cache the final score for the football-data odds generator.
    home_score = int(sb_match.get("home_score", 0) or 0)
    away_score = int(sb_match.get("away_score", 0) or 0)
    cache.match_scores[match_id] = (home_score, away_score)

    return match_id, was_created, season_id, date_str
462
463
def _sb_enrich_stadium(cur, match_id: int, sb_match: dict):
    """Overwrite a placeholder stadium name once StatsBomb provides the real one.

    Only rows still named 'Unknown home stadium - …' are touched.
    """
    real_name = (sb_match.get("stadium") or {}).get("name", "").strip()
    if not real_name:
        return
    cur.execute(
        """UPDATE Stadium s SET name=%s FROM "match" m
        WHERE m.match_id=%s AND m.stadium_id=s.stadium_id
        AND s.name LIKE 'Unknown home stadium - %%'""",
        (real_name[:100], match_id)
    )
478
479
def _sb_enrich_referee(cur, match_id: int, sb_match: dict, cache: _SBCache,
                       country: str | None = None):
    """Ensure the match referee exists and is linked via Referee_match."""
    referee_info = sb_match.get("referee")
    if not referee_info:
        return
    ref_name = referee_info.get("name", "").strip()
    if not ref_name:
        return

    # "International" is not a usable country — fall back to the first
    # participating team (home, then away) that carries a country name.
    if not country or "international" in country.lower():
        country = None
        for side_key in ("home_team", "away_team"):
            side = sb_match.get(side_key) or {}
            candidate = (side.get("country") or {}).get("name")
            if candidate:
                country = candidate
                break

    ref_id = cache.ensure_referee(cur, ref_name, country)
    pair = (ref_id, match_id)
    if pair in cache.referee_match_pairs:
        return
    cur.execute(
        "INSERT INTO Referee_match (referee_id, match_id, role) "
        "VALUES (%s,%s,'main') ON CONFLICT DO NOTHING",
        pair
    )
    cache.referee_match_pairs.add(pair)
509
510
def _sb_collect_coach_appearances(match_id: int, sb_match: dict, cache: _SBCache,
                                  season_id: int, match_date_str: str):
    """Record one appearance tuple per manager per side for later stint building.

    Appends (name, nationality, team_id, season_id, match_date) entries to
    cache.coach_appearances; nothing is written to the DB here.
    """
    when = date.fromisoformat(match_date_str)

    for side_key in ("home_team", "away_team"):
        side = sb_match.get(side_key, {})
        tid = cache.teams.resolve(side.get(f"{side_key}_name", ""))
        if not tid:
            continue
        for manager in side.get("managers", []):
            coach_name = (manager.get("name", "") or "").strip()
            if not coach_name:
                continue
            nationality = (manager.get("country", {}).get("name", "") or "").strip() or None
            cache.coach_appearances.append((coach_name, nationality, tid, season_id, when))
528
529
# collapse collected coach appearances into stint rows
def _sb_finalize_coach_assignments(cur, cache: _SBCache):
    """Convert per-match coach appearances into Coach_assignment stints.

    Appearances for the same (coach, team) are sorted by date and split
    into stints wherever the gap between consecutive matches exceeds
    COACH_STINT_GAP_DAYS.  Existing assignments for the affected pairs are
    deleted first, so re-running the import is idempotent.
    """
    if not cache.coach_appearances:
        print(" No coach appearances to finalise.")
        return

    # resolve / create Coach rows (cached per name in ensure_coach) before
    # building the stint map.
    # appearances stored as (name, nationality, team_id, season_id, match_date)
    resolved: list[tuple[int, int, int, date]] = []
    for name, nationality, team_id, season_id, match_date in cache.coach_appearances:
        coach_id = cache.ensure_coach(cur, name, nationality)
        resolved.append((coach_id, team_id, season_id, match_date))

    # group by (coach_id, team_id) — one bucket per potential stint sequence.
    groups: dict[tuple[int, int], list[tuple[int, date]]] = {}
    for cid, tid, sid, d in resolved:
        groups.setdefault((cid, tid), []).append((sid, d))

    # wipe prior assignments for exactly the (coach, team) pairs we are
    # about to rebuild; UNNEST pairs the two arrays positionally.
    if groups:
        pair_cids = [cid for cid, _ in groups.keys()]
        pair_tids = [tid for _, tid in groups.keys()]
        cur.execute(
            "DELETE FROM Coach_assignment "
            "WHERE (coach_id, team_id) IN "
            "(SELECT * FROM UNNEST(%s::int[], %s::int[]))",
            (pair_cids, pair_tids)
        )

    rows: list[tuple] = []
    today = date.today()
    for (cid, tid), entries in groups.items():
        # sort by date so the linear walk below detects gaps correctly.
        entries.sort(key=lambda x: x[1])

        # split entries into stints based on gap-from-previous.
        # each stint is itself a list of (season_id, match_date) tuples.
        stints: list[list[tuple[int, date]]] = []
        for entry in entries:
            _, d = entry
            if not stints or (d - stints[-1][-1][1]).days > COACH_STINT_GAP_DAYS:
                stints.append([entry])
            else:
                stints[-1].append(entry)

        # one Coach_assignment row per stint with proper boundaries.
        for i, stint in enumerate(stints):
            start_season = stint[0][0]  # season of the stint's first match
            start_d = stint[0][1]
            last_d = stint[-1][1]

            if i + 1 < len(stints):
                # there's another stint: this one ends the day before it starts.
                end_d = stints[i + 1][0][1] - timedelta(days=1)
            elif (today - last_d).days <= COACH_STILL_ACTIVE_DAYS:
                # most recent stint and last match is recent → still active.
                end_d = None
            else:
                # closed historical stint; pad with a small buffer.
                end_d = last_d + timedelta(days=COACH_STINT_END_BUFFER_DAYS)

            rows.append((cid, start_season, tid, _random_coach_salary(), start_d, end_d))

    if rows:
        execute_values(
            cur,
            "INSERT INTO Coach_assignment "
            "(coach_id, season_id, team_id, assignment_value, start_date, end_date) "
            "VALUES %s",
            rows, page_size=2000,  # OPT-3
        )
    print(f" Coach stints inserted: {len(rows)} "
          f"(from {len(cache.coach_appearances)} per-match appearances)")
603
604
# insert lineup file players into lineup rows
def _sb_import_lineups(cur, match_id: int, lineup_path: str, cache: _SBCache) -> int:
    """Insert Lineup rows for one match from a StatsBomb lineup file.

    Returns the number of rows submitted; ON CONFLICT DO NOTHING means the
    count is an upper bound on rows actually added.
    """
    lineups = load_json(lineup_path)
    inserted = 0
    for team_entry in lineups:
        team_id = cache.teams.resolve(team_entry.get("team_name", ""))
        if not team_id:
            continue
        # make sure every named player has a Player row before building rows.
        cache.ensure_players(
            cur, [p.get("player_name", "").strip() for p in team_entry.get("lineup", [])]
        )
        lineup_rows = []
        for p in team_entry.get("lineup", []):
            player_name = p.get("player_name", "").strip()
            if not player_name or player_name not in cache.player:
                continue
            positions = p.get("positions", [])
            position = positions[0].get("position", "Unknown") if positions else "Unknown"
            shirt_no = p.get("jersey_number")
            # starter when any position spell began as "Starting XI";
            # with no position data at all, default to starter.
            is_starter = any(pos.get("start_reason") == "Starting XI" for pos in positions) \
                if positions else True
            # shirt number is required and must be positive.
            if shirt_no is None:
                continue
            shirt_no = int(shirt_no)
            if shirt_no < 1:
                continue
            lineup_rows.append(
                (match_id, team_id, cache.player[player_name],
                 shirt_no, position[:50], is_starter)
            )
        if lineup_rows:
            execute_values(
                cur,
                "INSERT INTO Lineup (match_id, team_id, player_id, shirt_number, position, is_starter) "
                "VALUES %s ON CONFLICT DO NOTHING",
                lineup_rows, page_size=2000,  # OPT-3
            )
            inserted += len(lineup_rows)
    return inserted
644
645
646# StatsBomb event-attribute keys we deliberately drop when flattening
647_SB_SKIP_ATTR_KEYS = {
648 "id", "index", "period", "timestamp", "minute", "second",
649 "type", "possession", "possession_team", "play_pattern",
650 "team", "player", "position", "location", "match_id",
651 "related_events", "off_camera", "out",
652}
653
654# don't save this attributes to keep event_attribute < 30M
655_SB_SKIP_FULL_KEYS = {"duration", "pass.angle"}
656
657
658# flatten nested even◘t payload into string key/value attributes
659def _sb_flatten_attrs(event: dict, prefix: str = "") -> dict[str, str]:
660 result = {}
661 for k, v in event.items():
662 full_key = f"{prefix}{k}" if prefix else k
663 if not prefix and k in _SB_SKIP_ATTR_KEYS:
664 continue
665 if full_key in _SB_SKIP_FULL_KEYS:
666 continue
667 if isinstance(v, dict):
668 if "name" in v:
669 result[full_key] = str(v["name"])
670 else:
671 result.update(_sb_flatten_attrs(v, prefix=full_key + "."))
672 elif isinstance(v, list):
673 result[full_key] = json.dumps(v)
674 elif v is not None:
675 result[full_key] = str(v)
676 return result
677
678
# build lineup rows from starting xi events when lineup files are missing
def _sb_lineup_from_starting_xi(ev: dict, match_id: int, cache: _SBCache) -> list[tuple]:
    """Synthesise Lineup tuples from a "Starting XI" event's tactics block.

    Returns (match_id, team_id, player_id, shirt_no, position, True) rows;
    any malformed entry is skipped, and any unexpected failure yields [].
    """
    try:
        team_id = cache.teams.resolve(ev.get("team", {}).get("name", ""))
        if not team_id:
            return []
        sb_lineup = ev.get("tactics", {}).get("lineup", [])
        if not sb_lineup:
            return []
        rows = []
        for entry in sb_lineup:
            # per-entry guard: malformed entries are dropped, not fatal.
            try:
                player_name = entry["player"]["name"].strip()
                shirt_no = int(entry["jersey_number"])
                position = entry.get("position", {}).get("name", "Unknown") or "Unknown"
            except (KeyError, TypeError, ValueError):
                continue
            if not player_name or shirt_no < 1:
                continue
            player_id = cache.player.get(player_name)
            if not player_id:
                continue
            # is_starter is always True — these come from the Starting XI.
            rows.append((match_id, team_id, player_id, shirt_no, position[:50], True))
        return rows
    except Exception:
        # NOTE(review): deliberately best-effort — any unexpected payload
        # shape returns no rows rather than aborting the match import;
        # consider narrowing this catch if silent drops become a problem.
        return []
705
706
# insert non-starting-xi events and return counts for events and xi rows
def _sb_import_events(cur, match_id: int, events_path: str, cache: _SBCache
                      ) -> tuple[int, int]:
    """Import one match's events file.

    "Starting XI" events become Lineup rows; every other known event type
    becomes an Event row plus flattened Event_attribute rows.
    Returns (events_inserted, xi_lineup_rows_submitted).
    """
    events = load_json(events_path)

    # pre-create any new players (regular event participants + Starting XI).
    cache.ensure_players(cur, [ev.get("player", {}).get("name", "") for ev in events])
    cache.ensure_players(cur, [
        entry["player"]["name"].strip()
        for ev in events
        if ev.get("type", {}).get("name") == "Starting XI"
        for entry in ev.get("tactics", {}).get("lineup", [])
        if isinstance(entry.get("player"), dict) and entry["player"].get("name", "").strip()
    ])
    cache.ensure_event_types(cur, [ev.get("type", {}).get("name", "") for ev in events])

    event_rows: list[tuple] = []
    attr_sets: list[dict] = []  # kept index-aligned with event_rows
    xi_lineup_rows: list[tuple] = []

    for ev in events:
        ev_type_name = ev.get("type", {}).get("name", "").strip()
        if not ev_type_name:
            continue
        # starting XI events synthesise lineup rows rather than event rows.
        if ev_type_name == "Starting XI":
            xi_lineup_rows.extend(_sb_lineup_from_starting_xi(ev, match_id, cache))
            continue
        if ev_type_name not in cache.event_type:
            continue
        team_id = cache.teams.resolve(ev.get("team", {}).get("name", ""))
        if not team_id:
            continue
        player_name = ev.get("player", {}).get("name", "")
        if not player_name or player_name not in cache.player:
            continue
        # minute clamped to [0, 130], second to [0, 59].
        event_rows.append((
            match_id, team_id, cache.player[player_name], cache.event_type[ev_type_name],
            max(0, min(130, int(ev.get("minute", 0)))),
            max(0, min(59, int(ev.get("second", 0)))),
        ))
        attr_sets.append(_sb_flatten_attrs(ev))

    n_xi = 0
    if xi_lineup_rows:
        execute_values(
            cur,
            "INSERT INTO Lineup (match_id, team_id, player_id, shirt_number, position, is_starter) "
            "VALUES %s ON CONFLICT DO NOTHING",
            xi_lineup_rows, page_size=2000,  # OPT-3
        )
        n_xi = len(xi_lineup_rows)

    if not event_rows:
        return 0, n_xi

    inserted_rows = execute_values(
        cur,
        "INSERT INTO Event (match_id, team_id, player_id, event_type_id, minute, second) "
        "VALUES %s RETURNING event_id",
        event_rows, fetch=True, page_size=2000,  # OPT-3
    )
    inserted_ids = [r[0] for r in inserted_rows]

    # NOTE(review): pairing attrs with ids via zip assumes RETURNING yields
    # ids in input order across execute_values pages — verify this holds
    # for the deployed PostgreSQL/psycopg2 versions.
    attr_rows = [
        (eid, name[:50], val)
        for eid, attrs in zip(inserted_ids, attr_sets)
        for name, val in attrs.items()
    ]
    if attr_rows:
        execute_values(
            cur,
            "INSERT INTO Event_attribute (event_id, attribute_name, attribute_value) "
            "VALUES %s ON CONFLICT DO NOTHING",
            attr_rows, page_size=2000,  # OPT-3
        )

    return len(inserted_ids), n_xi
785
786
# statsbomb - returns {match_id → (home, away) goals} (the football-data odds generator needs it)
def run_statsbomb(conn) -> dict[int, tuple[int, int]]:
    """Pipeline 1: import every StatsBomb competition/season into the DB.

    For each match: find/create the match row, enrich stadium and referee,
    collect coach appearances, then import lineups and events.  Commits
    every SB_COMMIT_EVERY matches.  Returns {db_match_id: (home, away)}
    final scores for the football-data odds pipeline.
    """
    print("\n" + "═" * 70)
    print("PIPELINE 1 — STATSBOMB")
    print("═" * 70)

    competitions = load_json(SB_COMPETITIONS)
    print(f" Found {len(competitions)} competition/season entries")

    cur = conn.cursor()

    # TeamResolver also lives across pipelines: it'll be re-initialised in
    # the football-data pipeline so it picks up everything we insert here
    # (including any aliases we flush at the end).
    team_resolver = TeamResolver(cur)
    cache = _SBCache(cur, team_resolver)

    print(f" Cache loaded: {len(cache.teams._by_name)} teams, "
          f"{len(cache.player)} players, {len(cache.event_type)} event types\n")

    total_linked = total_created = total_skipped = 0
    total_players = total_xi = total_events = 0
    matches_since_commit = 0

    for comp in competitions:
        # StatsBomb's own ids — they only locate the matches file on disk.
        comp_id = comp["competition_id"]
        season_id = comp["season_id"]
        matches_file = os.path.join(SB_MATCHES_DIR, str(comp_id), f"{season_id}.json")
        if not os.path.exists(matches_file):
            continue

        # load matches
        matches = load_json(matches_file)
        print(f"\n -- {comp['competition_name']} / {comp['season_name']} "
              f"({len(matches)} matches) --")

        for i, sb_match in enumerate(matches, 1):
            sb_mid = sb_match["match_id"]
            home_name = sb_match["home_team"]["home_team_name"]
            away_name = sb_match["away_team"]["away_team_name"]
            print(f" [{i}/{len(matches)}] {home_name} vs {away_name} "
                  f"({sb_match.get('match_date', '?')})", end=" ... ", flush=True)

            db_mid, was_created, match_season_id, date_str = _sb_find_or_create_match(cur, cache, sb_match, comp)
            if db_mid is None:
                print("skipped")
                total_skipped += 1
                continue
            if was_created:
                total_created += 1
                print(f"\n [new match] id={db_mid}", end="")

            _sb_enrich_stadium(cur, db_mid, sb_match)
            # pass the competition's country so newly created Referee rows get
            # a non-NULL country; comp is already in scope from the outer loop.
            _sb_enrich_referee(cur, db_mid, sb_match, cache,
                               comp.get("country_name") or None)

            _sb_collect_coach_appearances(db_mid, sb_match, cache,
                                          match_season_id, date_str)

            n_players = 0
            lineup_path = os.path.join(SB_LINEUPS_DIR, f"{sb_mid}.json")
            if os.path.exists(lineup_path):
                n_players = _sb_import_lineups(cur, db_mid, lineup_path, cache)

            n_events = n_xi = 0
            events_path = os.path.join(SB_EVENTS_DIR, f"{sb_mid}.json")
            if os.path.exists(events_path):
                n_events, n_xi = _sb_import_events(cur, db_mid, events_path, cache)

            # persist any aliases learned while resolving this match's teams.
            cache.teams.flush_aliases(cur)

            total_linked += 1
            total_players += n_players
            total_xi += n_xi
            total_events += n_events
            matches_since_commit += 1
            print(f"done ({n_players} lineup, {n_xi} XI, {n_events} events)")

            if matches_since_commit >= SB_COMMIT_EVERY:
                conn.commit()
                matches_since_commit = 0

    # finalize coach assignments
    print("\n Finalising coach assignments (collapsing match appearances "
          "into stints)...")
    _sb_finalize_coach_assignments(cur, cache)

    conn.commit()
    cur.close()

    print("\n -- StatsBomb Summary --")
    print(f" Matches linked (pre-existing) : {total_linked - total_created}")
    print(f" Matches created from SB data : {total_created}")
    print(f" Matches skipped : {total_skipped}")
    print(f" Lineup rows (lineup files) : {total_players}")
    print(f" Lineup rows (Starting XI) : {total_xi}")
    print(f" Events inserted : {total_events}")
    print(f" Final scores cached : {len(cache.match_scores)}")

    return cache.match_scores
889
890
891# football-data
892
893
894# strip BOMs, NBSPs and weirdness and lowercase the header
895def _fd_normalize_header(name: str) -> str:
896 name = name.replace("", "").replace("\xa0", " ")
897 return re.sub(r"\s+", "", name.strip().strip('"')).lower()
898
899
900def _fd_parse_date(value):
901 if value is None or (isinstance(value, float) and pd.isna(value)):
902 return None
903 v = str(value).strip()
904 if not v:
905 return None
906 for fmt in ("%d/%m/%Y", "%d/%m/%y"):
907 try:
908 return pd.to_datetime(v, format=fmt).date()
909 except ValueError:
910 continue
911 return None
912
913
914# extracts bookmaker names that have a full triple
915def _fd_bookmaker_prefixes(columns: list[str]) -> list[str]:
916 col_set = set(columns)
917 prefixes = []
918 for col in columns:
919 if re.match(r"^[a-z0-9]+h$", col) and len(col) > 1:
920 prefix = col[:-1]
921 if prefix and (prefix + "d") in col_set and (prefix + "a") in col_set:
922 prefixes.append(prefix)
923 return sorted(set(prefixes))
924
925
926def _fd_read_csv(path: str) -> pd.DataFrame:
927 raw = None
928 used_encoding = None
929 for enc in ("utf-8-sig", "cp1252"):
930 try:
931 with open(path, encoding=enc, newline="") as f:
932 raw = f.read()
933 used_encoding = enc
934 break
935 except UnicodeDecodeError:
936 continue
937 if raw is None:
938 raise ValueError(f"Could not decode {path} with UTF-8 or WIN1252")
939 print(f" encoding: {used_encoding}")
940 raw = raw.replace("\r", "").replace("\xa0", " ")
941 lines = [l for l in raw.split("\n") if l.strip()]
942 if not lines:
943 return pd.DataFrame()
944 header_fields = next(csv.reader([lines[0]]))
945 num_cols = len(header_fields)
946 rows = []
947 for line in lines[1:]:
948 fields = next(csv.reader([line]))
949 # ad short rows with None / truncate long rows so the resulting
950 if len(fields) < num_cols:
951 fields += [None] * (num_cols - len(fields))
952 elif len(fields) > num_cols:
953 fields = fields[:num_cols]
954 rows.append(fields)
955 return pd.DataFrame(rows, columns=header_fields)
956
957
958def _fd_clean_str(value) -> str | None:
959 if value is None:
960 return None
961 s = str(value).strip()
962 return None if s in ("", "nan", "None") else s
963
964
965# returns dicts for each row
def _fd_process_file(path: str) -> list[dict]:
    """Parse one football-data CSV into per-match dicts.

    Each dict carries the home/away team names, the match date, the referee
    name (may be None) and a list of complete bookmaker H/D/A odds triples.
    Rows with a missing date, a missing team, or identical teams are dropped.
    """
    frame = _fd_read_csv(path)
    if frame.empty:
        print("    Empty file, skipping.")
        return []

    frame.columns = [_fd_normalize_header(c) for c in frame.columns]
    required = {"date", "hometeam", "awayteam"}
    if required - set(frame.columns):
        print(f"    Missing columns {required - set(frame.columns)}, skipping.")
        return []

    prefixes = _fd_bookmaker_prefixes(list(frame.columns))
    print(f"    bookmaker triples found: {len(prefixes)}")

    results: list[dict] = []
    for _, row in frame.iterrows():
        match_date = _fd_parse_date(row.get("date"))
        home = _fd_clean_str(row.get("hometeam"))
        away = _fd_clean_str(row.get("awayteam"))
        ref = _fd_clean_str(row.get("referee"))

        if match_date is None or not home or not away or home == away:
            continue

        # Keep only bookmakers whose H/D/A triple is complete and positive.
        triples = []
        for prefix in prefixes:
            try:
                h = float(row.get(prefix + "h") or 0)
                d = float(row.get(prefix + "d") or 0)
                a = float(row.get(prefix + "a") or 0)
            except (ValueError, TypeError):
                continue
            if h > 0 and d > 0 and a > 0:
                triples.append({
                    "bookmaker": prefix.upper(),
                    "home_win": h,
                    "draw": d,
                    "away_win": a,
                })

        results.append({
            "home": home,
            "away": away,
            "date": match_date,
            "referee": ref,
            "odds": triples,
        })

    return results
1017
1018
1019# generate one full pre-match to full-time series of synthetic odds
1020def _fd_generate_timeseries_odds(match_id: int,
1021 match_date: date,
1022 baseline: tuple[float, float, float],
1023 score: tuple[int, int]) -> list[tuple]:
1024 # build synthetic odds snapshots at 0,10,...,90 minutes from baseline and final score
1025 p_home, p_draw, p_away = baseline
1026 home_goals, away_goals = score
1027 rows = []
1028
1029 base_ts = datetime.combine(match_date, time(0, 0, 0))
1030
1031 for minute in range(0, 91, 10):
1032 t_w = minute / 90 # 0 at kick-off, 1 at full time
1033 h_so_far = math.floor(home_goals * t_w)
1034 a_so_far = math.floor(away_goals * t_w)
1035 diff = h_so_far - a_so_far # +ve when home leads
1036
1037 # Home win prob raises when home is leading; effect grows with time.
1038 adj_h = max(0.02, min(0.97, p_home + diff * 0.08 * t_w))
1039 # Draw prob ↓ as |goal_diff| grows and as time runs out.
1040 adj_d = max(0.02, min(0.97, p_draw - abs(diff) * 0.06 * t_w))
1041 # Away win prob mirrors home in reverse.
1042 adj_a = max(0.02, min(0.97, p_away + (-diff) * 0.08 * t_w))
1043
1044 total = adj_h + adj_d + adj_a
1045 # Renormalise (so the three probs again sum to 1) then convert to decimal odds.
1046 home_odds = max(1.01, round(1.0 / (adj_h / total) * 0.95, 2))
1047 draw_odds = max(1.01, round(1.0 / (adj_d / total) * 0.95, 2))
1048 away_odds = max(1.01, round(1.0 / (adj_a / total) * 0.95, 2))
1049
1050 rt = base_ts + timedelta(minutes=minute)
1051 rows.append((match_id, "Generated", home_odds, draw_odds, away_odds, rt))
1052
1053 return rows
1054
1055
1056# import football-data odds and referees then generate synthetic odds series
def run_football_data_odds(conn, match_scores: dict[int, tuple[int, int]]):
    """Pipeline 2: attach football-data bookmaker odds and referees to matches.

    For every CSV row that maps onto an existing StatsBomb match (team
    resolution + exact date), inserts one Match_odds row per bookmaker and
    ensures/links the named referee.  Then, for each matched match with a
    cached StatsBomb final score in *match_scores*, synthesises a 'Generated'
    0–90 minute odds time-series from the averaged kick-off probabilities.
    Commits twice: after the real rows, and after the generated series.
    """
    print("\n" + "═" * 70)
    print("PIPELINE 2 — FOOTBALL-DATA (odds + referees only)")
    print("═" * 70)

    csv_files = sorted(f for f in os.listdir(FD_FOLDER) if f.lower().endswith(".csv"))
    if not csv_files:
        print(f"  No CSV files found in {FD_FOLDER}, skipping.")
        return

    cur = conn.cursor()

    team_resolver = TeamResolver(cur, manual_aliases=FD_TEAM_ALIASES)

    # build the (home_id, away_id, match_date) to match_id map once.
    cur.execute('SELECT match_id, home_team_id, away_team_id, match_date FROM "match"')
    match_lookup: dict[tuple[int, int, date], int] = {
        (h, a, d): m for m, h, a, d in cur.fetchall()
    }
    print(f"  Loaded {len(match_lookup)} existing matches for lookup.")

    # referee caches so we don't query the DB once per row.
    cur.execute("SELECT referee_id, name FROM Referee")
    ref_cache: dict[str, int] = {n: i for i, n in cur.fetchall()}
    cur.execute("SELECT referee_id, match_id FROM Referee_match")
    ref_match_pairs: set[tuple[int, int]] = {(r, m) for r, m in cur.fetchall()}

    # for referee's country we choose home team's country
    cur.execute("SELECT team_id, country FROM Team")
    team_country_map: dict[int, str | None] = {tid: ctry for tid, ctry in cur.fetchall()}

    real_odds_rows: list[tuple] = []
    referee_rows: list[tuple] = []
    matched_rows = unmatched_rows = 0
    matched_match_ids: set[int] = set()

    print(f"  Processing {len(csv_files)} CSV file(s)...")
    for filename in csv_files:
        path = os.path.join(FD_FOLDER, filename)
        # NOTE(review): this f-string has no placeholder — it presumably was
        # meant to interpolate the file name/path; confirm before changing.
        print(f"  Reading: (unknown)")
        for r in _fd_process_file(path):
            home_id = team_resolver.resolve(r["home"])
            away_id = team_resolver.resolve(r["away"])
            if not home_id or not away_id:
                unmatched_rows += 1
                continue
            mid = match_lookup.get((home_id, away_id, r["date"]))
            if mid is None:
                unmatched_rows += 1
                continue

            matched_rows += 1
            matched_match_ids.add(mid)

            # one Match_odds row per bookmaker.
            # (the CSVs carry no kick-off time, so noon is used as a stand-in)
            kickoff_ts = datetime.combine(r["date"], time(12, 0, 0))
            for o in r["odds"]:
                real_odds_rows.append((
                    mid, o["bookmaker"], o["home_win"], o["draw"], o["away_win"],
                    kickoff_ts,
                ))

            # referee linking: ensure the row exists in Referee then attach
            # it to this match if not already attached.
            if r["referee"]:
                ref_id = ref_cache.get(r["referee"])
                if ref_id is None:
                    # use the home team's country as the referee's country
                    # (fall back to the away team if home has no country set).
                    ref_country = (team_country_map.get(home_id)
                                   or team_country_map.get(away_id))
                    cur.execute(
                        "INSERT INTO Referee (name, country, date_of_birth) "
                        "VALUES (%s,%s,%s) RETURNING referee_id",
                        (r["referee"], ref_country, _random_referee_dob())
                    )
                    ref_id = cur.fetchone()[0]
                    ref_cache[r["referee"]] = ref_id
                if (ref_id, mid) not in ref_match_pairs:
                    referee_rows.append((ref_id, mid, "main"))
                    ref_match_pairs.add((ref_id, mid))

    print(f"\n  Matched FD rows to SB matches : {matched_rows}")
    print(f"  Unmatched (no team or no SB match) : {unmatched_rows}")

    # wipe previous synthetic rows so re-runs regenerate them from scratch.
    cur.execute("DELETE FROM Match_odds WHERE bookmaker = 'Generated'")

    # real bookmaker rows
    if real_odds_rows:
        execute_values(
            cur,
            """INSERT INTO Match_odds (match_id, bookmaker, home_win, draw, away_win, recorded_time)
               VALUES %s
               ON CONFLICT (match_id, bookmaker, recorded_time) DO UPDATE
               SET home_win=EXCLUDED.home_win,
                   draw=EXCLUDED.draw,
                   away_win=EXCLUDED.away_win""",
            real_odds_rows, page_size=2000,
        )

    if referee_rows:
        execute_values(
            cur,
            "INSERT INTO Referee_match (referee_id, match_id, role) VALUES %s "
            "ON CONFLICT DO NOTHING",
            referee_rows, page_size=2000,
        )

    conn.commit()

    # generated time-series
    # for every match that now has at least one real bookmaker row AND a
    # cached final score from StatsBomb, derive the kick-off implied
    # probability (averaged across bookmakers) and synthesise the 0..90 min
    # series.
    if not matched_match_ids:
        print("  No matched matches → no time-series odds to generate.")
        cur.close()
        return

    # pull per-match baseline implied probabilities + match_date in one query.
    cur.execute(
        """
        SELECT mo.match_id,
               AVG(1.0 / mo.home_win) AS p_home,
               AVG(1.0 / mo.draw) AS p_draw,
               AVG(1.0 / mo.away_win) AS p_away,
               m.match_date
        FROM Match_odds mo
        JOIN "match" m ON m.match_id = mo.match_id
        WHERE mo.bookmaker <> 'Generated'
          AND mo.match_id = ANY(%s)
        GROUP BY mo.match_id, m.match_date
        """,
        (list(matched_match_ids),)
    )
    baselines = cur.fetchall()

    generated: list[tuple] = []
    skipped_no_score = 0
    for mid, p_h, p_d, p_a, mdate in baselines:
        score = match_scores.get(mid)
        if score is None:
            skipped_no_score += 1
            continue
        generated.extend(_fd_generate_timeseries_odds(
            mid, mdate, (float(p_h), float(p_d), float(p_a)), score
        ))

    if generated:
        execute_values(
            cur,
            """INSERT INTO Match_odds (match_id, bookmaker, home_win, draw, away_win, recorded_time)
               VALUES %s
               ON CONFLICT (match_id, bookmaker, recorded_time) DO NOTHING""",
            generated, page_size=2000,  # OPT-3
        )

    conn.commit()
    cur.close()

    print(f"  Real bookmaker rows inserted : {len(real_odds_rows)}")
    print(f"  Referee links inserted : {len(referee_rows)}")
    print(f"  Generated time-series rows : {len(generated)} "
          f"({len(generated) // 10 if generated else 0} matches × 10 snapshots)")
    if skipped_no_score:
        print(f"  Skipped (no SB final score) : {skipped_no_score}")
1224
1225
1226# transfermarkt
1227
1228# resolves transfermarkt player name to DB player_id
1229# order: exact name -> word match (split on space/hyphen, skip ≤ 2 chars) -> fuzzy
1230class _PlayerResolver:
1231 def __init__(self, cur):
1232 self._by_name: dict[str, int] = {}
1233 self._cache: dict[str, int | None] = {}
1234 self._fuzzy_list: list[tuple[str, int]] = []
1235
1236 cur.execute("SELECT player_id, name FROM Player")
1237 for pid, name in cur.fetchall():
1238 self._by_name[name.lower()] = pid
1239 self._fuzzy_list.append((name, pid))
1240
1241 print(f" PlayerResolver: {len(self._by_name)} players loaded from DB")
1242
1243 def resolve(self, tm_name: str) -> int | None:
1244 if not tm_name:
1245 return None
1246 if tm_name in self._cache:
1247 return self._cache[tm_name]
1248 key = tm_name.lower().strip()
1249
1250 pid = self._by_name.get(key)
1251 if pid:
1252 self._cache[tm_name] = pid
1253 return pid
1254
1255 for word in re.split(r"[\s\-]+", tm_name):
1256 if len(word) <= 2:
1257 continue
1258 pid = self._by_name.get(word.lower())
1259 if pid:
1260 print(f" [word] '{tm_name}' → '{word}' (id={pid})")
1261 self._cache[tm_name] = pid
1262 return pid
1263
1264 if self._fuzzy_list:
1265 result = fuzz_process.extractOne(
1266 tm_name,
1267 [n for n, _ in self._fuzzy_list],
1268 scorer=fuzz.token_sort_ratio
1269 )
1270 if result and result[1] >= PLAYER_FUZZY_THRESHOLD:
1271 best_name, best_score = result[0], result[1]
1272 pid = next(p for n, p in self._fuzzy_list if n == best_name)
1273 print(f" [fuzzy] '{tm_name}' → '{best_name}' (score={best_score})")
1274 self._cache[tm_name] = pid
1275 return pid
1276
1277 self._cache[tm_name] = None
1278 return None
1279
1280
# map messy transfer_fee text into canonical transfer type labels
1282
1283
1284def _tm_classify_transfer_type(row) -> str:
1285 fee = row.get("transfer_fee", None)
1286 fee_str = str(fee).strip().lower() if pd.notna(fee) else ""
1287 if fee_str in ("", "nan", "none", "-", "?"):
1288 return "Unknown"
1289 if fee_str in ("0", "0.0"):
1290 return "Free Transfer"
1291 if "loan" in fee_str and "end" in fee_str:
1292 return "End of Loan"
1293 if "loan" in fee_str:
1294 return "Loan"
1295 return "Permanent"
1296
1297
1298def _tm_get_or_create_transfer_type(cur, cache: dict[str, int], type_name: str) -> int:
1299 if type_name in cache:
1300 return cache[type_name]
1301 cur.execute("SELECT transfer_type_id FROM Transfer_type WHERE type=%s LIMIT 1", (type_name,))
1302 row = cur.fetchone()
1303 if row:
1304 cache[type_name] = row[0]
1305 else:
1306 cur.execute(
1307 "INSERT INTO Transfer_type (type) VALUES (%s) RETURNING transfer_type_id",
1308 (type_name,)
1309 )
1310 cache[type_name] = cur.fetchone()[0]
1311 return cache[type_name]
1312
1313
1314def _tm_parse_fee(raw) -> float | None:
1315 if raw is None or (isinstance(raw, float) and pd.isna(raw)):
1316 return None
1317 try:
1318 return max(float(raw), 0.0)
1319 except (ValueError, TypeError):
1320 return None
1321
1322
1323_season_resolve_cache: dict[str, int | None] = {}
1324
1325
1326def _tm_resolve_season(cur, season_str: str) -> int | None:
1327 if season_str in _season_resolve_cache:
1328 return _season_resolve_cache[season_str]
1329 candidates = [season_str]
1330 if re.match(r"^\d{4}$", season_str):
1331 y = int(season_str)
1332 candidates += [f"{y - 1}/{str(y)[2:]}", f"{y}/{y + 1}", f"{str(y)[2:]}/{str(y + 1)[2:]}"]
1333 m = re.match(r"^(\d{4})/(\d{4})$", season_str)
1334 if m:
1335 y1, y2 = m.group(1), m.group(2)
1336 candidates += [f"{y1}/{y2[2:]}", f"{y1[2:]}/{y2[2:]}"]
1337 for c in candidates:
1338 cur.execute("SELECT season_id FROM Season WHERE season_name=%s LIMIT 1", (c,))
1339 row = cur.fetchone()
1340 if row:
1341 _season_resolve_cache[season_str] = row[0]
1342 return row[0]
1343 _season_resolve_cache[season_str] = None
1344 return None
1345
1346
def _tm_build_player_map(players_df: pd.DataFrame, resolver: _PlayerResolver
                         ) -> dict[int, int]:
    """Build a Transfermarkt player_id -> DB player_id map via name matching."""
    mapping: dict[int, int] = {}
    matched = 0
    for _, record in players_df.iterrows():
        tm_id = int(record["player_id"])
        tm_name = str(record.get("name", "")).strip()
        db_id = resolver.resolve(tm_name)
        if db_id:
            mapping[tm_id] = db_id
            matched += 1
    print(f"  Player mapping: {matched}/{len(players_df)} TM players matched")
    return mapping
1360
1361
def _tm_import_valuations(cur, df: pd.DataFrame, player_map: dict[int, int]) -> int:
    """Insert Player_valuation rows from the Transfermarkt valuations CSV.

    Rows are skipped (and counted) when the player is unmapped, the date is
    missing/unparseable, the market value is absent, or no Season matches the
    valuation year.  Returns the number of rows handed to INSERT.
    """
    rows = []
    # BUGFIX: bad/missing dates were previously lumped into the "no season"
    # counter; track them separately, consistent with _tm_import_transfers.
    skipped_player = skipped_date = skipped_season = skipped_fee = 0
    for _, row in df.iterrows():
        db_pid = player_map.get(int(row["player_id"]))
        if not db_pid:
            skipped_player += 1
            continue
        raw_date = row.get("date")
        if pd.isna(raw_date) or not raw_date:
            skipped_date += 1
            continue
        val_date = pd.to_datetime(raw_date, errors="coerce")
        if pd.isna(val_date):
            skipped_date += 1
            continue
        mv = _tm_parse_fee(row.get("market_value_in_eur"))
        if mv is None:
            skipped_fee += 1
            continue
        year = val_date.year
        # A valuation belongs to the season spanning its calendar year,
        # e.g. 2019 -> "2018/19", falling back to a plain "2019" label.
        season_id = _tm_resolve_season(cur, f"{year - 1}/{str(year)[2:]}") or \
                    _tm_resolve_season(cur, str(year))
        if not season_id:
            skipped_season += 1
            continue
        rows.append((db_pid, season_id, mv, val_date.date()))

    if rows:
        execute_values(
            cur,
            "INSERT INTO Player_valuation (player_id, season_id, market_value, valuation_date) "
            "VALUES %s ON CONFLICT DO NOTHING",
            rows, page_size=2000,  # OPT-3
        )
    print(f"  Valuations: {len(rows)} inserted | "
          f"{skipped_player} no player | {skipped_date} bad date | "
          f"{skipped_season} no season | {skipped_fee} no value")
    return len(rows)
1400
1401
def _tm_import_transfers(cur, df: pd.DataFrame, player_map: dict[int, int],
                         team_resolver: TeamResolver) -> int:
    """Insert Transfer rows from the Transfermarkt transfers CSV.

    Skips (and counts) rows whose player or clubs cannot be resolved, whose
    date is missing/unparseable, or whose source and destination club are
    identical.  Returns the number of rows handed to INSERT.
    """
    type_cache: dict[str, int] = {}
    rows = []
    skipped_player = skipped_team = skipped_date = skipped_same = 0

    for _, record in df.iterrows():
        db_pid = player_map.get(int(record["player_id"]))
        if not db_pid:
            skipped_player += 1
            continue

        raw_date = record.get("transfer_date")
        if pd.isna(raw_date) or not raw_date:
            skipped_date += 1
            continue
        transfer_date = pd.to_datetime(raw_date, errors="coerce")
        if pd.isna(transfer_date):
            skipped_date += 1
            continue

        src_team = team_resolver.resolve(str(record.get("from_club_name", "") or "").strip())
        dst_team = team_resolver.resolve(str(record.get("to_club_name", "") or "").strip())
        if not src_team or not dst_team:
            skipped_team += 1
            continue
        if src_team == dst_team:
            skipped_same += 1
            continue

        fee = _tm_parse_fee(record.get("transfer_fee")) or 0.0
        type_id = _tm_get_or_create_transfer_type(
            cur, type_cache, _tm_classify_transfer_type(record)
        )
        rows.append((db_pid, src_team, dst_team, transfer_date.date(), fee, type_id))

    if rows:
        execute_values(
            cur,
            "INSERT INTO Transfer "
            "(player_id, from_team_id, to_team_id, transfer_date, fee, transfer_type_id) "
            "VALUES %s ON CONFLICT DO NOTHING",
            rows, page_size=2000,  # OPT-3
        )
    print(f"  Transfers: {len(rows)} inserted | "
          f"{skipped_player} no player | {skipped_team} no team | "
          f"{skipped_date} bad date | {skipped_same} same from/to")
    return len(rows)
1449
1450
def run_transfermarkt(conn):
    """Pipeline 3: import Transfermarkt player valuations and transfers."""
    print("\n" + "═" * 70)
    print("PIPELINE 3 — TRANSFERMARKT")
    print("═" * 70)

    print("  Loading CSVs...")
    tm_players = pd.read_csv(TM_PLAYERS, low_memory=False)
    tm_valuations = pd.read_csv(TM_VALUATIONS, low_memory=False)
    tm_transfers = pd.read_csv(TM_TRANSFERS, low_memory=False)
    print(f"    players: {len(tm_players):,} | "
          f"valuations: {len(tm_valuations):,} | transfers: {len(tm_transfers):,}")

    cur = conn.cursor()
    resolver = _PlayerResolver(cur)
    team_resolver = TeamResolver(cur)
    player_map = _tm_build_player_map(tm_players, resolver)

    print("  Importing valuations...")
    _tm_import_valuations(cur, tm_valuations, player_map)
    conn.commit()

    print("  Importing transfers...")
    _tm_import_transfers(cur, tm_transfers, player_map, team_resolver)
    conn.commit()

    cur.close()
1477
1478
def main():
    """Run the three import pipelines in order on a single DB connection."""
    print("PitchMap Master Importer")
    print("Pipelines: statsbomb → football-data (odds) → transfermarkt\n")

    conn = psycopg2.connect(**DB_CONFIG)
    # Skip the per-commit fsync for this session: bulk seeding makes many
    # small commits and does not need crash durability.
    cur = conn.cursor()
    cur.execute("SET synchronous_commit = off")
    conn.commit()
    cur.close()

    try:
        # Pipeline 1 caches final scores that pipeline 2 needs for the
        # synthetic odds series.
        scores = run_statsbomb(conn)
        run_football_data_odds(conn, scores)
        run_transfermarkt(conn)
    finally:
        conn.close()

    print("\n" + "═" * 70)
    print("All pipelines complete.")
    print("═" * 70)
1501
1502
# Script entry point: run all three pipelines when executed directly.
if __name__ == "__main__":
    main()