| from datasets import load_dataset | |
# Configuration name requested from the "JetBrains-Research/lca-results"
# dataset, i.e. which model's predictions load_data() fetches.
MODEL = 'cmg_gpt_4_0613'
# Directory handed to load_dataset() as `cache_dir` for both datasets.
CACHE_DIR = 'cache'
def load_data(model=None, cache_dir=None):
    """Load the reference commit messages joined with a model's predictions.

    The "test" split of the "commitchronicle-py-long" configuration of
    "JetBrains-Research/lca-cmg" supplies the reference rows; its ``message``
    column is renamed to ``reference``. The "test" split of the requested
    configuration of "JetBrains-Research/lca-results" supplies the
    ``prediction`` column. Both tables are keyed on ``(hash, repo)``.

    Args:
        model: Configuration name to load from "JetBrains-Research/lca-results".
            Defaults to the module-level ``MODEL`` when ``None``.
        cache_dir: Directory passed to ``load_dataset`` as ``cache_dir``.
            Defaults to the module-level ``CACHE_DIR`` when ``None``.

    Returns:
        A list of dicts, one per row of the joined table. Each dict holds the
        ``hash`` and ``repo`` keys, the remaining reference columns (including
        ``reference``) and ``prediction``.
    """
    # Resolve defaults at call time so that later changes to the module-level
    # constants are still honoured, exactly as with the parameterless version.
    if model is None:
        model = MODEL
    if cache_dir is None:
        cache_dir = CACHE_DIR

    index_columns = ['hash', 'repo']

    references = load_dataset(
        "JetBrains-Research/lca-cmg",
        "commitchronicle-py-long",
        split="test",
        cache_dir=cache_dir,
    ).to_pandas()
    references = references.set_index(index_columns).rename(
        columns={'message': 'reference'})

    predictions = load_dataset(
        "JetBrains-Research/lca-results",
        model,
        split="test",
        cache_dir=cache_dir,
    ).to_pandas()
    predictions = predictions.set_index(index_columns)[["prediction"]]
    # Keep only the first prediction per (hash, repo) so the join below cannot
    # produce several output rows for a single reference row.
    predictions = predictions[~predictions.index.duplicated(keep='first')]

    # join() is called with its default `how`, which pandas documents as a
    # left join: every reference row is kept, and rows with no matching
    # prediction get a missing value in the `prediction` column.
    merged = references.join(other=predictions)
    return merged.reset_index().to_dict('records')