import os

from datasets import load_dataset
# Directory handed to `load_dataset` as `cache_dir` when fetching the dataset.
CACHE_DIR = 'cache'
# Upper bound on the number of records `load_data` returns.
N_SAMPLES = 15
# Commit hashes whose rows are dropped from the dataset before sampling.
REMOVED_COMMITS = ['9cc896202dc38d962c01aa2637dbc5bbc3e3dd9b']
def load_data():
    """Return the first ``N_SAMPLES`` commit-rewriting samples as dicts.

    Loads the ``train`` split of
    ``JetBrains-Research/commit-rewriting-samples`` (using ``CACHE_DIR`` as
    the cache directory and the ``HF_REWRITING_TOKEN`` environment variable
    as the access token; ``None`` is passed when the variable is unset),
    drops every row whose ``hash`` is listed in ``REMOVED_COMMITS``, and
    returns the leading rows.

    Returns:
        A list with at most ``N_SAMPLES`` dicts, one per remaining row, keyed
        by column name.
    """
    df = load_dataset(
        "JetBrains-Research/commit-rewriting-samples",
        split="train",
        token=os.environ.get('HF_REWRITING_TOKEN'),
        cache_dir=CACHE_DIR,
    ).to_pandas()

    # Exclude the commits that were explicitly removed from the sample set.
    kept = df[~df['hash'].isin(REMOVED_COMMITS)]

    # Slice first so only the rows actually returned are converted to dicts;
    # `iloc[:N_SAMPLES]` follows the same slice semantics as list slicing.
    return kept.iloc[:N_SAMPLES].to_dict('records')