Spaces:
Running
Running
| import polars as pl | |
| from data import data_df | |
| from types import SimpleNamespace | |
def filter_data_by_date_and_game_kind(data, start_date=None, end_date=None, game_kind=None):
    """Narrow ``data`` to an optional date window and game kind.

    Every criterion whose argument is ``None`` is skipped; when all three are
    ``None`` the input object is returned untouched.

    Args:
        data: Frame exposing ``.filter``; the criteria read its ``date`` and
            ``coarse_game_kind`` columns.
        start_date: Inclusive lower bound on ``date``.
        end_date: Inclusive upper bound on ``date``.
        game_kind: Value that ``coarse_game_kind`` must equal.

    Returns:
        The frame after all requested filters have been applied.
    """
    predicates = []
    if start_date is not None:
        predicates.append(pl.col('date') >= start_date)
    if end_date is not None:
        predicates.append(pl.col('date') <= end_date)
    if game_kind is not None:
        predicates.append(pl.col('coarse_game_kind') == game_kind)

    # Apply one filter per requested criterion, in date-start, date-end,
    # game-kind order.
    for predicate in predicates:
        data = data.filter(predicate)
    return data
def compute_team_games(data):
    """Attach each team's total number of distinct games to every row.

    A team's total is the number of distinct ``gameId`` values in which it
    appears as ``HomeTeamNameES`` plus the number in which it appears as
    ``VisitorTeamNameES``.

    Args:
        data: polars DataFrame with ``gameId``, ``HomeTeamNameES`` and
            ``VisitorTeamNameES`` columns.

    Returns:
        ``data`` with two trailing columns:

        * ``home_games`` - total games (home + away) of the row's home team.
        * ``visitor_games`` - total games (home + away) of the row's
          visiting team.
    """
    home_counts = (
        data
        .group_by('HomeTeamNameES')
        .agg(pl.col('gameId').n_unique().alias('home_games'))
        .rename({'HomeTeamNameES': 'team'})
    )
    visitor_counts = (
        data
        .group_by('VisitorTeamNameES')
        .agg(pl.col('gameId').n_unique().alias('visitor_games'))
        .rename({'VisitorTeamNameES': 'team'})
    )

    # Full (outer) join: a team that has only hosted, or only visited, inside
    # the filtered data must still receive a total. An inner join here would
    # leave it out and the joins below would then silently discard every row
    # that involves that team.
    game_data = (
        home_counts
        .join(visitor_counts, on='team', how='full', coalesce=True)
        .with_columns(
            (
                pl.col('home_games').fill_null(0)
                + pl.col('visitor_games').fill_null(0)
            ).alias('games')
        )
        .select('team', 'games')
    )

    return (
        data
        # Tolerate input that already carries these columns (e.g. a frame that
        # went through this function before); they are recomputed below.
        .drop('home_games', 'visitor_games', strict=False)
        .join(
            game_data.rename({'games': 'home_games'}),
            left_on='HomeTeamNameES',
            right_on='team',
        )
        .join(
            game_data.rename({'games': 'visitor_games'}),
            left_on='VisitorTeamNameES',
            right_on='team',
        )
    )
def compute_pitch_stats(data, player_type, pitch_class_type, min_pitches=1):
    """Aggregate per-player, per-pitch-type rate statistics and percentiles.

    Rows are grouped by the player id column, the pitch-type code column and
    ``pitcher_team_name_short``. For every group the function computes swing,
    contact, whiff, zone and location rates, the batted-ball mix (``GB%``,
    ``FB%``, ``LD%``), the pitch ``count``, the group's ``usage`` share of the
    player's pitches and a ``qualified`` flag (``count >= min_pitches``).
    For a fixed list of rate stats a ``<stat>_pctl`` column is added: the rank
    of the value among qualified rows divided by the number of qualified rows.

    Args:
        data: polars DataFrame of pitch-level rows.
        player_type: ``'pitcher'`` (group on ``pitId``) or ``'batter'``
            (group on ``batId``).
        pitch_class_type: ``'specific'`` (``ballKind_code`` / ``ballKind``) or
            ``'general'`` (``general_ballKind_code`` / ``general_ballKind``).
            In the general case the two columns are renamed to
            ``ballKind_code`` / ``ballKind`` in the result.
        min_pitches: Minimum ``count`` for a row to be ``qualified``.

    Returns:
        polars DataFrame sorted by player id ascending, then ``count``
        descending.

    Raises:
        AssertionError: If ``player_type`` or ``pitch_class_type`` is not one
            of the accepted values.
    """
    assert player_type in ('pitcher', 'batter')
    assert pitch_class_type in ('general', 'specific')
    id_col = 'pitId' if player_type == 'pitcher' else 'batId'
    pitch_col = 'ballKind_code' if pitch_class_type == 'specific' else 'general_ballKind_code'
    pitch_name_col = 'ballKind' if pitch_class_type == 'specific' else 'general_ballKind'

    # batType categories combined below into GB% / FB% / LD%.
    batted_ball_cols = ('G', 'F', 'B', 'P', 'L')
    pctl_stats = [
        'Swing%', 'Z-Swing%', 'Chase%', 'Contact%', 'Z-Contact%', 'O-Contact%',
        'SwStr%', 'Whiff%', 'CSW%', 'GB%', 'FB%', 'LD%', 'Zone%',
    ]

    def percentile(stat):
        # Non-qualified rows become null, so they neither receive a rank nor
        # count towards the denominator.
        qualified_values = pl.when(pl.col('qualified')).then(pl.col(stat))
        # FB%, LD% and the three contact rates are ranked in descending order.
        descending = stat in ['FB%', 'LD%'] or 'Contact%' in stat
        return (
            qualified_values.rank(descending=descending) / qualified_values.count()
        ).alias(f'{stat}_pctl')

    # NOTE(review): 'pitcher_name' and 'pitcher_team_name_short' are used for
    # player_type='batter' as well - confirm that is intended.
    per_pitch = (
        data
        .group_by(id_col, pitch_col, 'pitcher_team_name_short')
        .agg(
            pl.first('pitcher_name'),
            *([pl.first('general_ballKind')] if pitch_class_type == 'specific' else []),
            pl.first(pitch_name_col),
            pl.len().alias('count'),
            pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
            (pl.col('swing').sum() / pl.col('pitch').sum()).alias('Swing%'),
            ((pl.col('swing') & pl.col('zone')).sum() / pl.col('pitch').sum()).alias('Z-Swing%'),
            ((pl.col('swing') & ~pl.col('zone')).sum() / pl.col('pitch').sum()).alias('Chase%'),
            ((pl.col('swing') & ~pl.col('whiff')).sum() / pl.col('swing').sum()).alias('Contact%'),
            (
                (pl.col('zone') & pl.col('swing') & ~pl.col('whiff')).sum()
                / (pl.col('zone') & pl.col('swing')).sum()
            ).alias('Z-Contact%'),
            (
                (~pl.col('zone') & pl.col('swing') & ~pl.col('whiff')).sum()
                / (~pl.col('zone') & pl.col('swing')).sum()
            ).alias('O-Contact%'),
            (pl.col('whiff').sum() / pl.col('swing').sum()).alias('Whiff%'),
            (pl.col('whiff').sum() / pl.col('pitch').sum()).alias('SwStr%'),
            (pl.col('csw').sum() / pl.col('pitch').sum()).alias('CSW%'),
            (pl.col('zone').sum() / pl.col('pitch').sum()).alias('Zone%'),
            # The side of the plate is mirrored for pitchers whose pitLR is
            # not 'r'.
            (pl.when(pl.col('pitLR') == 'r').then(pl.col('x') < 0).otherwise(pl.col('x') > 0)).mean().alias('Glove%'),
            (pl.when(pl.col('pitLR') == 'r').then(pl.col('x') >= 0).otherwise(pl.col('x') <= 0)).mean().alias('Arm%'),
            (pl.col('y') > 125).mean().alias('High%'),
            (pl.col('y') <= 125).mean().alias('Low%'),
            (pl.col('x').is_between(-20, 20) & pl.col('y').is_between(100, 100 + 50)).mean().alias('MM%'),
        )
        .with_columns(
            # Normalise within the player actually being grouped on; using a
            # hard-coded 'pitId' breaks player_type='batter', where that
            # column is not part of the aggregated frame.
            (pl.col('count') / pl.sum('count').over(id_col)).alias('usage'),
            (pl.col('count') >= min_pitches).alias('qualified'),
        )
        # Turn the list of {batType, proportion} structs into one column per
        # batType value.
        .explode('batType')
        .unnest('batType')
        .pivot(on='batType', values='proportion')
        .fill_null(0)
    )

    # The pivot only creates columns for batType values present in the data;
    # add zero-filled columns for any that are absent so the sums below never
    # hit a missing column on a narrowly filtered input.
    missing = [col for col in batted_ball_cols if col not in per_pitch.columns]
    if missing:
        per_pitch = per_pitch.with_columns(pl.lit(0.0).alias(col) for col in missing)

    pitch_stats = (
        per_pitch
        .with_columns(
            (pl.col('G') + pl.col('B')).alias('GB%'),
            (pl.col('F') + pl.col('P')).alias('FB%'),
            pl.col('L').alias('LD%').round(2),
        )
        .drop(*batted_ball_cols)
        # A 'null' column appears only when some group has no batType values
        # at all, so its absence must not be an error.
        .drop('null', strict=False)
        .with_columns(percentile(stat) for stat in pctl_stats)
        .rename({pitch_col: 'ballKind_code', pitch_name_col: 'ballKind'} if pitch_class_type == 'general' else {})
        .sort(id_col, 'count', descending=[False, True])
    )
    return pitch_stats
def get_pitcher_stats(id, lr=None, game_kind=None, start_date=None, end_date=None, min_ip=1, min_pitches=1, pitch_class_type='specific'):
    """Build the stat tables for a single pitcher.

    The module-level ``data_df`` is restricted to rows whose ``ballKind_code``
    is not ``'-'``, filtered by date range and game kind, and annotated with
    per-pitcher ``games`` and ``IP`` (number of distinct ``inning_code``
    values). Percentiles are computed across all pitchers in the filtered
    data before the result is narrowed to ``id``.

    Args:
        id: ``pitId`` of the pitcher to report on.
        lr: If given, keep only rows whose ``batLR`` equals this value.
        game_kind: Passed to ``filter_data_by_date_and_game_kind``.
        start_date: Passed to ``filter_data_by_date_and_game_kind``.
        end_date: Passed to ``filter_data_by_date_and_game_kind``.
        min_ip: Minimum ``IP`` for a pitcher to be ``qualified``; the string
            ``'qualified'`` means ``IP >= games`` instead.
        min_pitches: Passed to ``compute_pitch_stats``.
        pitch_class_type: Passed to ``compute_pitch_stats``.

    Returns:
        ``SimpleNamespace`` with three polars DataFrames:

        * ``pitcher_stats`` - ``K%``, ``BB%``, ``CSW%``, ``GB%``, ``FB%``,
          ``LD%`` and ``<stat>_pctl`` columns for ``CSW%``, ``K%``, ``BB%``
          and ``GB%``.
        * ``pitch_stats`` - the pitcher's rows from ``compute_pitch_stats``.
        * ``pitch_shapes`` - the pitcher's pitches with non-null ``x``/``y``
          and positive ``ballSpeed``.
    """
    # batType categories combined below into GB% / FB% / LD%.
    batted_ball_cols = ('G', 'F', 'B', 'P', 'L')

    source_data = filter_data_by_date_and_game_kind(
        data_df.filter(pl.col('ballKind_code') != '-'),
        start_date=start_date,
        end_date=end_date,
        game_kind=game_kind,
    )
    source_data = (
        compute_team_games(source_data)
        .with_columns(
            # Strings passed to then/otherwise are column names: pick the team
            # total from the home or visitor side depending on the half
            # inning, taking the pitcher's first row as representative.
            pl.when(pl.col('half_inning').str.ends_with('1')).then('home_games').otherwise('visitor_games').first().over('pitId').alias('games'),
            pl.col('inning_code').unique().len().over('pitId').alias('IP'),
        )
    )

    ip_threshold = pl.col('games') if min_ip == 'qualified' else min_ip
    source_data = source_data.with_columns((pl.col('IP') >= ip_threshold).alias('qualified'))

    # The handedness filter is applied after IP/games/qualified are computed,
    # so qualification is based on all batters faced.
    if lr is not None:
        source_data = source_data.filter(pl.col('batLR') == lr)

    pitch_stats = compute_pitch_stats(
        source_data,
        player_type='pitcher',
        pitch_class_type=pitch_class_type,
        min_pitches=min_pitches,
    ).filter(pl.col('pitId') == id)

    pitch_shapes = (
        source_data
        .filter(
            (pl.col('pitId') == id) &
            pl.col('x').is_not_null() &
            pl.col('y').is_not_null() &
            (pl.col('ballSpeed') > 0)
        )
        .select('pitId', 'general_ballKind_code', 'ballKind_code', 'ballSpeed', 'x', 'y')
    )

    def percentile(stat):
        # Non-qualified pitchers become null, so they neither receive a rank
        # nor count towards the denominator. BB% is ranked descending.
        qualified_values = pl.when(pl.col('qualified')).then(pl.col(stat))
        return (
            qualified_values.rank(descending=(stat == 'BB%')) / qualified_values.count()
        ).alias(f'{stat}_pctl')

    per_pitcher = (
        source_data
        .group_by('pitId')
        .agg(
            pl.col('pitcher_name').first(),
            (pl.when(pl.col('presult').str.contains('strikeout')).then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('K%'),
            (pl.when(pl.col('presult') == 'Walk').then(1).otherwise(0).sum() / pl.col('pa_code').unique().len()).alias('BB%'),
            (pl.col('csw').sum() / pl.col('pitch').sum()).alias('CSW%'),
            pl.col('aux_bresult').struct.field('batType').drop_nulls().value_counts(normalize=True),
            pl.first('qualified'),
        )
        # Turn the list of {batType, proportion} structs into one column per
        # batType value.
        .explode('batType')
        .unnest('batType')
        .pivot(on='batType', values='proportion')
        .fill_null(0)
    )

    # The pivot only creates columns for batType values present in the data;
    # add zero-filled columns for any that are absent so the sums below never
    # hit a missing column on a narrowly filtered input.
    missing = [col for col in batted_ball_cols if col not in per_pitcher.columns]
    if missing:
        per_pitcher = per_pitcher.with_columns(pl.lit(0.0).alias(col) for col in missing)

    pitcher_stats = (
        per_pitcher
        .with_columns(
            (pl.col('G') + pl.col('B')).alias('GB%'),
            (pl.col('F') + pl.col('P')).alias('FB%'),
            pl.col('L').alias('LD%'),
        )
        .drop(*batted_ball_cols)
        # A 'null' column appears only when some pitcher has no batType values
        # at all; drop it when present so the output schema does not depend on
        # the data.
        .drop('null', strict=False)
        .with_columns(percentile(stat) for stat in ['CSW%', 'K%', 'BB%', 'GB%'])
        .filter(pl.col('pitId') == id)
    )
    return SimpleNamespace(pitcher_stats=pitcher_stats, pitch_stats=pitch_stats, pitch_shapes=pitch_shapes)