Source code for mplsoccer.statsbomb

"""`mplsoccer.statsbomb` is a python module for loading StatsBomb open, local and API data."""

import json
import os

import pandas as pd
import requests

# Names exported by a star-import of this module: the three loader classes.
__all__ = ['Sbopen', 'Sbapi', 'Sblocal']


class Sbopen:
    """ Loader for the StatsBomb open-data.

    The files are read from https://github.com/statsbomb/open-data,
    which is published under a non-commercial license.

    Parameters
    ----------
    dataframe : bool, default True
        If True the methods return dataframes, if False they return
        flattened lists of dictionaries.
    """

    def __init__(self, dataframe=True):
        self.dataframe = dataframe
        self.url = 'https://raw.githubusercontent.com/statsbomb/open-data/master/data/'

    @staticmethod
    def _get_data(url):
        """ Download a file and decode its json content.

        Parameters
        ----------
        url : str

        Returns
        -------
        The json-decoded body of the response. An HTTP error status
        raises via ``raise_for_status``.
        """
        response = requests.get(url=url)
        response.raise_for_status()
        return response.json()

    def event(self, match_id):
        """ Load the open-data events of one match.

        Parameters
        ----------
        match_id : int

        Returns
        -------
        events, related, freeze, tactics
            The output of ``flatten_event``: dataframes or flattened
            lists of dictionaries depending on ``self.dataframe``.

        Examples
        --------
        >>> from mplsoccer import Sbopen
        >>> parser = Sbopen(dataframe=True)
        >>> events, related, freeze, tactics = parser.event(3788741)
        """
        raw_events = self._get_data(f'{self.url}events/{match_id}.json')
        return flatten_event(raw_events, match_id, self.dataframe)

    def lineup(self, match_id):
        """ Load the open-data lineups of one match.

        Parameters
        ----------
        match_id : int

        Returns
        -------
        lineups
            The output of ``flatten_lineup``: a dataframe or a flattened
            list of dictionaries depending on ``self.dataframe``.

        Examples
        --------
        >>> from mplsoccer import Sbopen
        >>> parser = Sbopen(dataframe=True)
        >>> lineups = parser.lineup(3788741)
        """
        raw_lineups = self._get_data(f'{self.url}lineups/{match_id}.json')
        return flatten_lineup(raw_lineups, match_id, self.dataframe)

    def match(self, competition_id, season_id):
        """ Load the open-data matches of one competition season.

        Parameters
        ----------
        competition_id : int
        season_id : int

        Returns
        -------
        matches
            The output of ``flatten_match``: a dataframe or a flattened
            list of dictionaries depending on ``self.dataframe``.

        Examples
        --------
        >>> from mplsoccer import Sbopen
        >>> parser = Sbopen(dataframe=True)
        >>> matches = parser.match(11, 1)
        """
        raw_matches = self._get_data(f'{self.url}matches/{competition_id}/{season_id}.json')
        return flatten_match(raw_matches, self.dataframe)

    def competition(self):
        """ Load the open-data competition listing.

        Returns
        -------
        competition
            A dataframe built directly from the decoded json, or the
            decoded json itself when ``self.dataframe`` is False.

        Examples
        --------
        >>> from mplsoccer import Sbopen
        >>> parser = Sbopen(dataframe=True)
        >>> competition = parser.competition()
        """
        raw_competitions = self._get_data(f'{self.url}competitions.json')
        if self.dataframe:
            return pd.DataFrame(raw_competitions)
        return raw_competitions

    def frame(self, match_id):
        """ Load the open-data 360 frames of one match.

        Parameters
        ----------
        match_id : int

        Returns
        -------
        frames, visible
            The output of ``flatten_360``: dataframes or flattened lists
            of dictionaries depending on ``self.dataframe``.

        Examples
        --------
        >>> from mplsoccer import Sbopen
        >>> parser = Sbopen(dataframe=True)
        >>> frames, visible = parser.frame(3788741)
        """
        raw_frames = self._get_data(f'{self.url}three-sixty/{match_id}.json')
        return flatten_360(raw_frames, match_id, self.dataframe)
class Sbapi:
    """ Loader for the StatsBomb API.

    Credentials come either from the ``username``/``password`` arguments
    or, when those are None, from the SB_USERNAME and SB_PASSWORD
    environmental variables.

    Parameters
    ----------
    username : str, default None
        StatsBomb API username. None means read SB_USERNAME from the
        environment.
    password : str, default None
        StatsBomb API password. None means read SB_PASSWORD from the
        environment.
    dataframe : bool, default True
        If True the methods return dataframes, if False they return
        flattened lists of dictionaries.
    """

    def __init__(self, username=None, password=None, dataframe=True):
        # explicit arguments win; the environment is only a fallback
        username = os.environ.get("SB_USERNAME") if username is None else username
        password = os.environ.get("SB_PASSWORD") if password is None else password
        self.auth = requests.auth.HTTPBasicAuth(username, password)
        self.dataframe = dataframe
        self.url = 'https://data.statsbombservices.com/api/v'

    def _get_data(self, url):
        """ Request a resource with basic authentication and decode its json.

        Parameters
        ----------
        url : str

        Returns
        -------
        The json-decoded body of the response. An HTTP error status
        raises via ``raise_for_status``.
        """
        response = requests.get(url=url, auth=self.auth)
        response.raise_for_status()
        return response.json()

    def event(self, match_id, version=6):
        """ Load the events of one match from the API.

        Parameters
        ----------
        match_id : int
        version : int, default 6
            API version placed in the request url.

        Returns
        -------
        events, related, freeze, tactics
            The output of ``flatten_event``: dataframes or flattened
            lists of dictionaries depending on ``self.dataframe``.

        Examples
        --------
        >>> from mplsoccer import Sbapi
        >>> parser = Sbapi(dataframe=True)
        >>> events, related, freeze, tactics = parser.event(3788741)
        """
        raw_events = self._get_data(f'{self.url}{version}/events/{match_id}')
        return flatten_event(raw_events, match_id, self.dataframe)

    def lineup(self, match_id, version=2):
        """ Load the lineups of one match from the API.

        Parameters
        ----------
        match_id : int
        version : int, default 2
            API version placed in the request url.

        Returns
        -------
        lineups
            The output of ``flatten_lineup``: a dataframe or a flattened
            list of dictionaries depending on ``self.dataframe``.

        Examples
        --------
        >>> from mplsoccer import Sbapi
        >>> parser = Sbapi(dataframe=True)
        >>> lineups = parser.lineup(3788741)
        """
        raw_lineups = self._get_data(f'{self.url}{version}/lineups/{match_id}')
        return flatten_lineup(raw_lineups, match_id, self.dataframe)

    def match(self, competition_id, season_id, version=5):
        """ Load the matches of one competition season from the API.

        Parameters
        ----------
        competition_id : int
        season_id : int
        version : int, default 5
            API version placed in the request url.

        Returns
        -------
        matches
            The output of ``flatten_match``: a dataframe or a flattened
            list of dictionaries depending on ``self.dataframe``.

        Examples
        --------
        >>> from mplsoccer import Sbapi
        >>> parser = Sbapi(dataframe=True)
        >>> matches = parser.match(11, 1)
        """
        raw_matches = self._get_data(
            f'{self.url}{version}/competitions/{competition_id}/seasons/{season_id}/matches')
        return flatten_match(raw_matches, self.dataframe)

    def competition(self, version=4):
        """ Load the competition listing from the API.

        Parameters
        ----------
        version : int, default 4
            API version placed in the request url.

        Returns
        -------
        competition
            A dataframe built directly from the decoded json, or the
            decoded json itself when ``self.dataframe`` is False.

        Examples
        --------
        >>> from mplsoccer import Sbapi
        >>> parser = Sbapi(dataframe=True)
        >>> competition = parser.competition()
        """
        raw_competitions = self._get_data(f'{self.url}{version}/competitions')
        if self.dataframe:
            return pd.DataFrame(raw_competitions)
        return raw_competitions

    def frame(self, match_id, version=1):
        """ Load the 360 frames of one match from the API.

        Parameters
        ----------
        match_id : int
        version : int, default 1
            API version placed in the request url.

        Returns
        -------
        frames, visible
            The output of ``flatten_360``: dataframes or flattened lists
            of dictionaries depending on ``self.dataframe``.

        Examples
        --------
        >>> from mplsoccer import Sbapi
        >>> parser = Sbapi(dataframe=True)
        >>> frames, visible = parser.frame(3788741)
        """
        raw_frames = self._get_data(f'{self.url}{version}/360-frames/{match_id}')
        return flatten_360(raw_frames, match_id, self.dataframe)
class Sblocal:
    """ Loader for StatsBomb data stored in local json files.

    Parameters
    ----------
    dataframe : bool, default True
        If True the methods return dataframes, if False they return
        flattened lists of dictionaries.
    """

    def __init__(self, dataframe=True):
        self.dataframe = dataframe

    @staticmethod
    def _get_data(path):
        """ Read and decode a utf-8 encoded json file.

        Parameters
        ----------
        path : path to file

        Returns
        -------
        The decoded json content. For the StatsBomb data this is
        typically a list of dictionaries.
        """
        with open(path, encoding='utf-8') as file:
            return json.load(file)

    @staticmethod
    def _match_id_from_path(path):
        """ Derive the match identifier from a file named ``<match_id>.<ext>``.

        The extension is removed with ``os.path.splitext`` instead of
        slicing off a fixed five characters, so the result does not
        depend on the suffix being exactly ``.json``.

        Parameters
        ----------
        path : path to file

        Returns
        -------
        match_id : int

        Raises
        ------
        ValueError
            If the file name without its extension is not an integer.
        """
        stem, _ = os.path.splitext(os.path.basename(path))
        return int(stem)

    def event(self, path):
        """ Read the event data from a local file.

        The match identifier is taken from the file name.

        Parameters
        ----------
        path : path to file

        Returns
        -------
        events, related, freeze, tactics
            The output of ``flatten_event``: dataframes or flattened
            lists of dictionaries depending on ``self.dataframe``.

        Examples
        --------
        >>> from mplsoccer import Sblocal
        >>> parser = Sblocal(dataframe=True)
        >>> events, related, freeze, tactics = parser.event(path)
        """
        data = self._get_data(path)
        match_id = self._match_id_from_path(path)
        return flatten_event(data, match_id, self.dataframe)

    def lineup(self, path):
        """ Read the lineup data from a local file.

        The match identifier is taken from the file name.

        Parameters
        ----------
        path : path to file

        Returns
        -------
        lineups
            The output of ``flatten_lineup``: a dataframe or a flattened
            list of dictionaries depending on ``self.dataframe``.

        Examples
        --------
        >>> from mplsoccer import Sblocal
        >>> parser = Sblocal(dataframe=True)
        >>> lineups = parser.lineup(path)
        """
        data = self._get_data(path)
        match_id = self._match_id_from_path(path)
        return flatten_lineup(data, match_id, self.dataframe)

    def match(self, path):
        """ Read the match data from a local file.

        Parameters
        ----------
        path : path to file

        Returns
        -------
        matches
            The output of ``flatten_match``: a dataframe or a flattened
            list of dictionaries depending on ``self.dataframe``.

        Examples
        --------
        >>> from mplsoccer import Sblocal
        >>> parser = Sblocal(dataframe=True)
        >>> matches = parser.match(path)
        """
        data = self._get_data(path)
        return flatten_match(data, self.dataframe)

    def competition(self, path):
        """ Read the competition data from a local file.

        Parameters
        ----------
        path : path to file

        Returns
        -------
        competition
            A dataframe built directly from the decoded json, or the
            decoded json itself when ``self.dataframe`` is False.

        Examples
        --------
        >>> from mplsoccer import Sblocal
        >>> parser = Sblocal(dataframe=True)
        >>> competition = parser.competition(path)
        """
        data = self._get_data(path)
        return pd.DataFrame(data) if self.dataframe else data

    def frame(self, path):
        """ Read the 360 data from a local file.

        The match identifier is taken from the file name.

        Parameters
        ----------
        path : path to file

        Returns
        -------
        frames, visible
            The output of ``flatten_360``: dataframes or flattened lists
            of dictionaries depending on ``self.dataframe``.

        Examples
        --------
        >>> from mplsoccer import Sblocal
        >>> parser = Sblocal(dataframe=True)
        >>> frames, visible = parser.frame(path)
        """
        data = self._get_data(path)
        match_id = self._match_id_from_path(path)
        return flatten_360(data, match_id, self.dataframe)
def _flatten_location(row, value, keyword=''):
    """ Flatten a location list into the dictionary keys (x, y) or (x, y, z).

    Parameters
    ----------
    row : dict
        Modified in place: receives the ``{keyword}x``, ``{keyword}y``
        and (for three values) ``{keyword}z`` keys.
    value : sequence
        The location, of length two or three.
    keyword : str, default ''
        Prefix for the new keys, e.g. 'end_'.

    Raises
    ------
    AssertionError
        If the location does not have exactly two or three values.
    """
    if len(value) == 2:
        row[f'{keyword}x'], row[f'{keyword}y'] = value
    elif len(value) == 3:
        row[f'{keyword}x'], row[f'{keyword}y'], row[f'{keyword}z'] = value
    else:
        msg = 'location length not equal to 2 (x, y) or 3 (x, y, z)'
        raise AssertionError(msg)


def _flatten_freeze(data, match_id, event_id):
    """ Flatten the freeze-frame events (in place) and return them.

    Each row gets the ``match_id`` and ``id`` (event identifier) keys,
    its 'location' is split into x/y(/z) and the nested 'player' and
    'position' dictionaries are expanded to ``player_*``/``position_*``.
    """
    for row in data:
        row['match_id'] = match_id
        row['id'] = event_id
        # iterate over a snapshot of the keys as the row is changed in the loop
        for key in list(row):
            value = row[key]
            if key == 'location':
                _flatten_location(row, value)
                del row['location']
            elif key in ['player', 'position']:
                for nested_key in value:
                    row[f'{key}_{nested_key}'] = value[nested_key]
                del row[key]
    return data


def _flatten_tactic(data, match_id, event_id):
    """ Flatten the tactics events (in place) and return them.

    Each row gets the ``match_id`` and ``id`` (event identifier) keys and
    the nested 'player' and 'position' dictionaries are expanded to
    ``player_*``/``position_*``.
    """
    for row in data:
        row['match_id'] = match_id
        row['id'] = event_id
        # iterate over a snapshot of the keys as the row is changed in the loop
        for key in list(row):
            if key in ['player', 'position']:
                value = row[key]
                for nested_key in value:
                    row[f'{key}_{nested_key}'] = value[nested_key]
                del row[key]
    return data


def _flatten_list_of_lists(list_of_lists, key):
    """ Flatten a list of lists into a list.

    Every item (dict) is also given ``item[key]``: its one-based
    position within the sublist it came from.
    """
    flat_list = []
    for sublist in list_of_lists:
        for idx, item in enumerate(sublist, start=1):
            item[key] = idx
            flat_list.append(item)
    return flat_list


def _event_dataframe(data):
    """ Transform the event dictionary into a dataframe.

    Returns None if there are no events. The dataframe is sorted by
    period, timestamp and index.
    """
    df = pd.DataFrame(data)
    if df.empty:
        return None
    # tactics_formation from float to string
    # (the column only exists if at least one event carried a formation)
    if 'tactics_formation' in df.columns:
        mask = df['tactics_formation'].notnull()
        tactics = df.loc[mask, 'tactics_formation'].astype(int).astype(str)
        df['tactics_formation'] = df['tactics_formation'].astype(str)
        df.loc[mask, 'tactics_formation'] = tactics
        df.loc[~mask, 'tactics_formation'] = None
    df['timestamp'] = pd.to_datetime(df['timestamp']).dt.time
    df.sort_values(['period', 'timestamp', 'index'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    for col in ['counterpress', 'under_pressure', 'off_camera', 'out']:
        if col in df.columns:
            df[col] = df[col].astype(float)
    return df


def _related_dataframe(data, df_events):
    """ Transform the related-events dictionary into a dataframe.

    For carries, we also ensure that both the carry and the related event
    are related both ways. Sometimes another event is not related to the
    carry event (but it is the other way round).

    Returns None if there are no related events.
    """
    df = pd.DataFrame(data)
    if df.empty:
        return None
    cols = ['id', 'index', 'type_name']
    # look up the index/type of the related event
    df = df.merge(df_events[cols].rename({'id': 'id_related'}, axis='columns'),
                  how='left', on='id_related', validate='m:1',
                  suffixes=('', '_related'))
    # swap the two sides of the carry relations and add them back in
    df_carry = df[df['type_name'] == 'Carry'].copy()
    df_carry.rename({'id': 'id_related',
                     'index': 'index_related',
                     'type_name': 'type_name_related',
                     'id_related': 'id',
                     'index_related': 'index',
                     'type_name_related': 'type_name'},
                    axis='columns', inplace=True)
    df = pd.concat([df, df_carry]).drop_duplicates()
    return df


def _competition_dataframe(data):
    """ Format the competition data as a dataframe.

    The match_updated/match_available (and 360 equivalents) columns are
    converted to datetimes when present.
    """
    # NOTE(review): the loader classes visible in this module build the
    # competition dataframe with pd.DataFrame directly - confirm whether
    # this helper is still meant to be used.
    df = pd.DataFrame(data)
    date_cols = ['match_updated', 'match_updated_360',
                 'match_available_360', 'match_available']
    for date in date_cols:
        if date in df.columns:
            df[date] = pd.to_datetime(df[date])
    return df


def _pandas_supports_iso8601():
    """ Whether pandas.to_datetime should be called with format='ISO8601'.

    The major version is compared numerically: a string comparison such
    as ``pd.__version__ < '2'`` would treat version '10.0' as older than '2'.
    """
    major = pd.__version__.split('.', maxsplit=1)[0]
    # an unparsable (e.g. development) version is assumed to be a modern pandas
    return not major.isdigit() or int(major) >= 2


def _match_dataframe(data):
    """ Format the match data as a dataframe.

    Returns None if there are no matches. The kick_off column is combined
    with the match_date and the date columns are converted to datetimes.
    """
    df = pd.DataFrame(data)
    if df.empty:
        return None
    df['kick_off'] = pd.to_datetime(df['match_date'] + ' ' + df['kick_off'])
    date_cols = ['match_date', 'last_updated', 'last_updated_360',
                 'home_team_managers_dob', 'away_team_managers_dob']
    use_iso8601 = _pandas_supports_iso8601()
    for date in date_cols:
        if date in df.columns:
            if use_iso8601:
                df[date] = pd.to_datetime(df[date], format='ISO8601')
            else:
                df[date] = pd.to_datetime(df[date])
    return df


def flatten_event(events, match_id, dataframe=True):
    """ Flatten the events (list) so each row (dictionary) contains no nested events.

    The event dictionaries are modified in place.

    Parameters
    ----------
    events : list of dicts
        The events to flatten.
    match_id : int
        The StatsBomb match identifier.
    dataframe : bool, default True
        Whether to return the results as a dataframe (True)
        or as flattened lists of dictionaries (False)

    Returns
    -------
    events, related, freeze, tactics
        If dataframe=True then returns dataframes (events and related are
        None when they are empty) else if dataframe=False each of
        the returned values is a list of dictionaries.
    """
    related = []
    freeze = []
    tactics = []
    cols_to_drop = ['pass_through_ball', 'pass_outswinging', 'pass_inswinging',
                    'clearance_head', 'clearance_left_foot', 'clearance_right_foot',
                    'pass_straight', 'clearance_other', 'goalkeeper_punched_out',
                    'goalkeeper_shot_saved_off_target', 'shot_saved_off_target',
                    'goalkeeper_shot_saved_to_post', 'shot_saved_to_post',
                    'goalkeeper_lost_out', 'goalkeeper_lost_in_play',
                    'goalkeeper_success_out', 'goalkeeper_success_in_play',
                    'goalkeeper_saved_to_post', 'shot_kick_off',
                    'goalkeeper_penalty_saved_to_post']
    for row in events:
        row['match_id'] = match_id
        # iterate over a snapshot of the keys as the row is changed in the loop
        for key in list(row):
            if isinstance(row[key], dict):
                for nested_key in list(row[key]):
                    nested_value = row[key][nested_key]
                    if nested_key == 'end_location':
                        _flatten_location(row, nested_value, keyword='end_')
                    elif nested_key == 'aerial_won':
                        row[f'{nested_key}'] = nested_value
                    elif nested_key in ['outcome', 'body_part', 'technique']:
                        for k in nested_value:
                            row[f'{nested_key}_{k}'] = nested_value[k]
                    elif nested_key == 'freeze_frame':
                        freeze.append(_flatten_freeze(nested_value, match_id, row['id']))
                    elif nested_key == 'lineup':
                        tactics.append(_flatten_tactic(nested_value, match_id, row['id']))
                    elif nested_key == 'type':
                        # a nested type is prefixed so it cannot clash
                        # with the event's own type_id/type_name
                        for k in nested_value:
                            row[f'sub_{nested_key}_{k}'] = nested_value[k]
                    elif isinstance(nested_value, dict):
                        for k in nested_value:
                            row[f'{key}_{nested_key}_{k}'] = nested_value[k]
                    else:
                        row[f'{key}_{nested_key}'] = nested_value
                del row[key]
        if 'location' in row:
            _flatten_location(row, row['location'])
            del row['location']
        row['type_name'] = row['type_name'].replace('Ball Receipt*', 'Ball Receipt')
        # pass through ball is deprecated now, but it was not always added to technique name
        if 'pass_through_ball' in row:
            row['technique_name'] = 'Through Ball'
        for col in cols_to_drop:
            row.pop(col, None)
        if 'related_events' in row:
            related.extend({'match_id': match_id,
                            'id': row['id'],
                            'index': row['index'],
                            'type_name': row['type_name'],
                            'id_related': related_event}
                           for related_event in row['related_events'])
            del row['related_events']
    tactics = _flatten_list_of_lists(tactics, key='event_tactics_id')
    freeze = _flatten_list_of_lists(freeze, key='event_freeze_id')
    if dataframe:
        events = _event_dataframe(events)
        related = _related_dataframe(related, events)
        freeze = pd.DataFrame(freeze)
        tactics = pd.DataFrame(tactics)
    return events, related, freeze, tactics


def flatten_lineup(data, match_id, dataframe=True):
    """ Flatten the lineup (list) so each row (dictionary) contains no nested events.

    The player dictionaries are modified in place.

    Parameters
    ----------
    data : list of dicts
        The lineup to flatten.
    match_id : int
        The StatsBomb match identifier.
    dataframe : bool, default True
        Whether to return the results as a dataframe (True)
        or as flattened lists of dictionaries (False)

    Returns
    -------
    lineups
        If dataframe=True then returns a dataframe
        else if dataframe=False returns a list of dictionaries.
    """
    lineup = []
    for row in data:
        for player in row['lineup']:
            player['match_id'] = match_id
            player['team_id'] = row['team_id']
            player['team_name'] = row['team_name']
            if 'country' in player:
                player['country_id'] = player['country']['id']
                player['country_name'] = player['country']['name']
                del player['country']
            # fall back to the full name when the nickname is missing
            if 'player_nickname' in player and player['player_nickname'] is None:
                player['player_nickname'] = player['player_name']
            player.pop('positions', None)  # if flattened would be multiple lines
            player.pop('cards', None)  # if flattened would be multiple lines
            lineup.append(player)
    if dataframe:
        lineup = pd.DataFrame(lineup)
    return lineup


def flatten_match(match, dataframe=True):
    """ Flatten the match (list) so each row (dictionary) contains no nested events.

    The match dictionaries are modified in place. For nested lists only
    the first item is kept and empty nested lists are skipped.

    Parameters
    ----------
    match : list of dicts
        The match to flatten.
    dataframe : bool, default True
        Whether to return the results as a dataframe (True)
        or as flattened lists of dictionaries (False)

    Returns
    -------
    matches
        If dataframe=True then returns a dataframe (None when there are
        no matches) else if dataframe=False returns a list of dictionaries.
    """
    for row in match:
        # iterate over a snapshot of the keys as the row is changed in the loop
        for key in list(row):
            value = row[key]
            if isinstance(value, dict):
                for nested_key in list(value):
                    nested_value = value[nested_key]
                    if isinstance(nested_value, list):
                        if not nested_value:
                            # nothing to flatten (avoids an IndexError)
                            continue
                        nested_value = nested_value[0]
                    if isinstance(nested_value, dict):
                        for k in list(nested_value):
                            if k == 'nickname' and not nested_value[k]:
                                # fall back to the full name
                                row[f'{key}_{nested_key}_{k}'] = nested_value['name']
                            elif isinstance(nested_value[k], dict):
                                for sub_k in nested_value[k]:
                                    nested_sub_value = nested_value[k][sub_k]
                                    row[f'{key}_{nested_key}_{k}_{sub_k}'] = nested_sub_value
                            else:
                                row[f'{key}_{nested_key}_{k}'] = nested_value[k]
                    elif key in ['competition_stage', 'stadium', 'referee', 'metadata']:
                        row[f'{key}_{nested_key}'] = nested_value
                    else:
                        row[nested_key] = nested_value
                del row[key]
    if dataframe:
        match = _match_dataframe(match)
    return match


def flatten_360(data, match_id, dataframe=True):
    """ Flatten the 360 data (list) so each row (dictionary) contains no nested events.

    The freeze-frame dictionaries are modified in place.

    Parameters
    ----------
    data : list of dicts
        The 360 data to flatten.
    match_id : int
        The StatsBomb match identifier.
    dataframe : bool, default True
        Whether to return the results as a dataframe (True)
        or as flattened lists of dictionaries (False)

    Returns
    -------
    frames, visible
        If dataframe=True then returns dataframes else if dataframe=False each of
        the returned values is a list of dictionaries.
    """
    frames = []
    visible = []
    for row in data:
        for frame in row['freeze_frame']:
            frame['match_id'] = match_id
            frame['id'] = row['event_uuid']
            _flatten_location(frame, frame['location'])
            del frame['location']
            frames.append(frame)
        frame_visible = {'match_id': match_id,
                         'id': row['event_uuid'],
                         'visible_area': row['visible_area'],
                         }
        visible.append(frame_visible)
    if dataframe:
        frames = pd.DataFrame(frames)
        visible = pd.DataFrame(visible)
    return frames, visible