patrickramos committed on
Commit
fc8fe72
·
1 Parent(s): 9f46512

Fix player name translation

Browse files
Files changed (1) hide show
  1. data.py +125 -107
data.py CHANGED
@@ -47,109 +47,6 @@ for season in tqdm(SEASONS):
47
  _aux_sched_df = pl.read_parquet(os.path.join(DATA_PATH, str(season), 'aux_schedule.parquet'))
48
  aux_sched_df = pl.concat((aux_sched_df, _aux_sched_df))
49
 
50
-
51
- def select_name(names):
52
- '''
53
- When given multiple names,
54
- prioritizes the name with ASCII characters (ex. R. マルティネス > マルティネス),
55
- followed by the shorter name (ex. 大勢 > 翁田 大勢)
56
- Names with ASCII characters help differentiate between foreign players,
57
- while shorter names are more accurate for players going by shorter names
58
- '''
59
- lens = []
60
- for name in names:
61
- if any([char in ascii_letters for char in name]):
62
- return name
63
- else:
64
- lens.append(len(name))
65
- return names[np.argmin(lens).item()]
66
-
67
- # load player dfs
68
- players_df = (
69
- pl.read_parquet('files/players.parquet')
70
- .with_columns(pl.col('playerName').str.normalize('NFKC').str.replace_all('・', ' '))
71
- .group_by('playerId').agg(pl.col('playerName').map_elements(select_name, return_dtype=pl.String))
72
- )
73
- translated_df = (
74
- pl.read_parquet('files/players_translated.parquet')
75
- .with_columns(pl.col('name_jp').str.normalize('NFKC').str.replace_all('・', ' '))
76
- ['name_jp', 'name_kana', 'name_en']
77
- .unique()
78
- )
79
- manual_translated_df = pl.read_parquet('files/players_translated_manual.parquet')
80
-
81
- # names with no romanization are approximated with kana translation
82
- kks = pykakasi.kakasi()
83
-
84
- # take names in parenthesis when they contain an ascii character
85
- translated_df = (
86
- translated_df
87
- .with_columns(
88
- pl.when(pl.col('name_jp').str.contains(r'\('))
89
- .then(pl.col('name_jp').str.extract(r'.*\(', 0).str.strip_chars_end(' ('))
90
- .otherwise(pl.col('name_jp'))
91
- .str.replace_all('・', ' ')
92
- .alias('name_jp')
93
- )
94
- .with_columns(pl.col('name_kana').str.normalize('NFKC').str.replace_all('・', ' '))
95
- .with_columns(pl.col('name_kana').str.extract(r'\(.*\)', 0).str.strip_chars('()').alias('in_parentheses'))
96
- .with_columns(pl.col('name_kana').str.extract(r'.*\(', 0).str.strip_chars_end('(').alias('before_parentheses'))
97
- .with_columns(
98
- pl.when(pl.col('name_en').is_null())
99
- .then
100
- (
101
- pl.when(pl.col('in_parentheses').is_not_null() | pl.col('before_parentheses').is_not_null())
102
- .then(
103
- pl.when(pl.col('in_parentheses').map_elements(lambda name: any([char in ascii_letters for char in name]), pl.Boolean))
104
- .then(pl.col('in_parentheses'))
105
- .otherwise(pl.col('before_parentheses'))
106
- )
107
- .otherwise(pl.col('name_kana').map_elements(lambda name: ''.join([word['hepburn'].capitalize() for word in kks.convert(name)]), return_dtype=pl.String))
108
- )
109
- .otherwise(pl.col('name_en'))
110
- .alias('name_en')
111
- )
112
- .with_columns(pl.col('name_en').str.replace_all(',', '').str.to_titlecase())
113
- )
114
-
115
- # handle inconsistent kanji between sources
116
- for old_char, new_char in [
117
- ('崎', '﨑'),
118
- ('高', '髙'),
119
- ('徳', '德'),
120
- ('濱', '濵'),
121
- ('瀬', '瀨')
122
- ]:
123
- players_df = (
124
- players_df.with_columns(
125
- pl.when(~pl.col('playerName').is_in(translated_df['name_jp']))
126
- .then(pl.col('playerName').str.replace(old_char, new_char))
127
- .otherwise('playerName')
128
- )
129
- )
130
-
131
- # merge player dfs
132
- players_df = (
133
- players_df
134
- .join(manual_translated_df.rename({'name_en': 'name_en_manual'}), on='playerId', how='left')
135
- .join(
136
- (
137
- translated_df
138
- .with_columns(
139
- pl.when(pl.col('name_jp').str.contains(r'\.') & ~pl.col('name_jp').is_in(players_df.filter(pl.len().over('playerName') == 1)['playerName']))
140
- .then(pl.col('name_jp').str.strip_chars(ascii_letters+'.'))
141
- .otherwise('name_jp')
142
- )
143
- [['name_jp', 'name_en']]
144
- ),
145
- left_on='playerName', right_on='name_jp', how='left'
146
- )
147
- .with_columns(pl.coalesce('name_en_manual', 'name_en').alias('name_en'))
148
- .unique() # remove duplicates from names with multiple matches in other dataframes
149
- .drop('name_en_manual', 'name_jp')
150
- # .filter(pl.col('name_en').is_null())
151
- )
152
-
153
  aux_df = (
154
  aux_df
155
  .filter(pl.col('type') != 'RUNNER')
@@ -257,9 +154,9 @@ data_df = (
257
  on='universal_code',
258
  how='left'
259
  )
260
- .join(
261
- players_df.rename({'name_en': 'pitcher_name'}), left_on='pitId', right_on='playerId', how='left'
262
- )
263
  .join(
264
  text_df[['GameID', 'GameKindID']].with_columns(
265
  pl.col('GameID').cast(pl.Int32),
@@ -294,7 +191,10 @@ data_df = (
294
  .alias('coarse_game_kind'),
295
 
296
  pl.when(pl.col('half_inning').str.ends_with(1)).then('HomeTeamNameES').otherwise('VisitorTeamNameES').alias('pitcher_team'),
297
- pl.when(pl.col('half_inning').str.ends_with(1)).then('home_team_name_short').otherwise('visitor_team_name_short').alias('pitcher_team_name_short')
 
 
 
298
  )
299
  .with_columns(
300
  pl.col('presult_id').replace_strict(presult).alias('presult')
@@ -314,5 +214,123 @@ data_df = (
314
  .filter(pl.col('ballKind_code') != '-')
315
  )
316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  if __name__ == '__main__':
318
  breakpoint()
 
47
  _aux_sched_df = pl.read_parquet(os.path.join(DATA_PATH, str(season), 'aux_schedule.parquet'))
48
  aux_sched_df = pl.concat((aux_sched_df, _aux_sched_df))
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  aux_df = (
51
  aux_df
52
  .filter(pl.col('type') != 'RUNNER')
 
154
  on='universal_code',
155
  how='left'
156
  )
157
+ # .join(
158
+ # players_df.rename({'name_en': 'pitcher_name'}), left_on='pitId', right_on='playerId', how='left'
159
+ # )
160
  .join(
161
  text_df[['GameID', 'GameKindID']].with_columns(
162
  pl.col('GameID').cast(pl.Int32),
 
191
  .alias('coarse_game_kind'),
192
 
193
  pl.when(pl.col('half_inning').str.ends_with(1)).then('HomeTeamNameES').otherwise('VisitorTeamNameES').alias('pitcher_team'),
194
+ pl.when(pl.col('half_inning').str.ends_with(1)).then('home_team_name_short').otherwise('visitor_team_name_short').alias('pitcher_team_name_short'),
195
+
196
+ pl.when(pl.col('half_inning').str.ends_with(2)).then('HomeTeamNameES').otherwise('VisitorTeamNameES').alias('batter_team'),
197
+ pl.when(pl.col('half_inning').str.ends_with(2)).then('home_team_name_short').otherwise('visitor_team_name_short').alias('batter_team_name_short')
198
  )
199
  .with_columns(
200
  pl.col('presult_id').replace_strict(presult).alias('presult')
 
214
  .filter(pl.col('ballKind_code') != '-')
215
  )
216
 
217
+
218
+ def select_name(names):
219
+ '''
220
+ When given multiple names,
221
+ prioritizes the name with ASCII characters (ex. R. マルティネス > マルティネス),
222
+ followed by the shorter name (ex. 大勢 > 翁田 大勢)
223
+ Names with ASCII characters help differentiate between foreign players,
224
+ while shorter names are more accurate for players going by shorter names
225
+ '''
226
+ lens = []
227
+ for name in names:
228
+ if any([char in ascii_letters for char in name]):
229
+ return name
230
+ else:
231
+ lens.append(len(name))
232
+ return names[np.argmin(lens).item()]
233
+
234
+
235
+ # load player dfs
236
+ players_df = (
237
+ pl.read_parquet('files/players.parquet')
238
+ .with_columns(pl.col('playerName').str.normalize('NFKC').str.replace_all('・', ' '))
239
+ .group_by('playerId').agg(pl.col('playerName').map_elements(select_name, return_dtype=pl.String))
240
+ )
241
+ translated_df = (
242
+ pl.read_parquet('files/players_translated.parquet')
243
+ .with_columns(pl.col('name_jp').str.normalize('NFKC').str.replace_all('・', ' '))
244
+ # ['name_jp', 'name_kana', 'name_en']
245
+ )
246
+ manual_translated_df = pl.read_parquet('files/players_translated_manual.parquet')
247
+
248
+ # get seasons and teams per player id
249
+ batter_df = (
250
+ data_df
251
+ .with_columns(pl.col('date').dt.year().alias('season'))
252
+ .unique(['batId', 'batter_team', 'season'])
253
+ ['batId', 'batter_team', 'season']
254
+ .rename({'batId': 'playerId', 'batter_team': 'team'})
255
+ )
256
+ pitcher_df = (
257
+ data_df
258
+ .with_columns(pl.col('date').dt.year().alias('season'))
259
+ .unique(['pitId', 'pitcher_team', 'season'])
260
+ ['pitId', 'pitcher_team', 'season']
261
+ .rename({'pitId': 'playerId', 'pitcher_team': 'team'})
262
+ )
263
+ players_df = players_df.join(pl.concat((pitcher_df, batter_df)).unique(), on='playerId')
264
+
265
+ # names with no romanization are approximated with kana translation
266
+ kks = pykakasi.kakasi()
267
+
268
+ # take names in parenthesis when they contain an ascii character
269
+ translated_df = (
270
+ translated_df
271
+ .with_columns(
272
+ pl.when(pl.col('name_jp').str.contains(r'\('))
273
+ .then(pl.col('name_jp').str.extract(r'.*\(', 0).str.strip_chars_end(' ('))
274
+ .otherwise(pl.col('name_jp'))
275
+ .str.replace_all('・', ' ')
276
+ .alias('name_jp')
277
+ )
278
+ .with_columns(pl.col('name_kana').str.normalize('NFKC').str.replace_all('・', ' '))
279
+ .with_columns(pl.col('name_kana').str.extract(r'\(.*\)', 0).str.strip_chars('()').alias('in_parentheses'))
280
+ .with_columns(pl.col('name_kana').str.extract(r'.*\(', 0).str.strip_chars_end('(').alias('before_parentheses'))
281
+ .with_columns(
282
+ pl.when(pl.col('name_en').is_null())
283
+ .then
284
+ (
285
+ pl.when(pl.col('in_parentheses').is_not_null() | pl.col('before_parentheses').is_not_null())
286
+ .then(
287
+ pl.when(pl.col('in_parentheses').map_elements(lambda name: any([char in ascii_letters for char in name]), pl.Boolean))
288
+ .then(pl.col('in_parentheses'))
289
+ .otherwise(pl.col('before_parentheses'))
290
+ )
291
+ .otherwise(pl.col('name_kana').map_elements(lambda name: ''.join([word['hepburn'].capitalize() for word in kks.convert(name)]), return_dtype=pl.String))
292
+ )
293
+ .otherwise(pl.col('name_en'))
294
+ .alias('name_en')
295
+ )
296
+ .with_columns(pl.col('name_en').str.replace_all(',', '').str.to_titlecase())
297
+ )
298
+
299
+ # handle inconsistent kanji between sources
300
+ for old_char, new_char in [
301
+ ('崎', '﨑'),
302
+ ('高', '髙'),
303
+ ('徳', '德'),
304
+ ('濱', '濵'),
305
+ ('瀬', '瀨')
306
+ ]:
307
+ players_df = (
308
+ players_df.with_columns(
309
+ pl.when(~pl.col('playerName').is_in(translated_df['name_jp']))
310
+ .then(pl.col('playerName').str.replace(old_char, new_char))
311
+ .otherwise('playerName')
312
+ )
313
+ )
314
+
315
+ # merge player dfs
316
+ players_df = (
317
+ players_df
318
+ .join(
319
+ translated_df
320
+ .with_columns(
321
+ pl.when(pl.col('name_jp').str.contains(r'\.') & ~pl.col('name_jp').is_in(players_df['playerName'].implode()))
322
+ .then(pl.col('name_jp').str.strip_chars(ascii_letters+'.'))
323
+ .otherwise('name_jp')
324
+ )
325
+ [['name_jp', 'name_en', 'team', 'season']],
326
+ left_on=['playerName', 'season', 'team'],
327
+ right_on=['name_jp', 'season', 'team']
328
+ )
329
+ )
330
+ print(players_df.filter(pl.len().over('playerId', 'team', 'season') > 1))
331
+ players_df = pl.concat((players_df.group_by('playerId').agg(pl.first('name_en')), manual_translated_df[['playerId', 'name_en']]))
332
+
333
+ # join players to data
334
+ data_df = data_df.join(players_df.rename({'name_en': 'pitcher_name'}), left_on='pitId', right_on='playerId', how='left')
335
  if __name__ == '__main__':
336
  breakpoint()