import os
import json
import pandas as pd
# Map each dataset to its (task, subtask) pair.
TASK_MAPPING = {
    "MasakhaPOS": ("NLU", "POS"),
    "MasakhaNER": ("NLU", "NER"),
    "AfriSenti": ("NLU", "Senti"),
    "NollySenti": ("NLU", "Senti"),
    "InjongoIntent": ("NLU", "Intent"),
    "MasakhaNEWS": ("NLU", "Topic"),
    "SIB": ("NLU", "Topic"),
    "AfriHate": ("NLU", "Hate"),
    "AfriXNLI": ("NLU", "NLI"),
    "AfriQA": ("QA", "XQA"),
    "Belebele": ("QA", "RC"),
    "NaijaRC": ("QA", "RC"),
    "UHURA": ("Knowledge", "Arc-E"),
    "OpenAIMMLU": ("Knowledge", "MMLU"),
    "AfriMMLU": ("Knowledge", "MMLU"),
    "AfriMGSM": ("Reasoning", "Math"),
    "SALT - en_xx": ("NLG", "MT(en/fr-xx)"),
    "SALT - xx_en": ("NLG", "MT(xx-en/fr)"),
    "Flores - en_xx": ("NLG", "MT(en/fr-xx)"),
    "Flores - xx_en": ("NLG", "MT(xx-en/fr)"),
    "MAFAND - en_xx": ("NLG", "MT(en/fr-xx)"),
    "MAFAND - xx_en": ("NLG", "MT(xx-en/fr)"),
    "NTREX - en_xx": ("NLG", "MT(en/fr-xx)"),
    "NTREX - xx_en": ("NLG", "MT(xx-en/fr)"),
    "XLSUM": ("NLG", "SUMM"),
    "ADR": ("NLG", "ADR"),
    "RC": ("QA", "RC"),
    "Sentiment": ("NLU", "Senti"),
    "TC": ("NLU", "Topic"),
    "MMLU": ("Knowledge", "MMLU"),
    "MT - xx-en": ("NLG", "MT(xx-en/fr)"),
    "MT - en-xx": ("NLG", "MT(en/fr-xx)"),
}
# Normalize the different model-name spellings used across result files.
MODEL_MAP = {
    "AfroLlama-V1": "AfroLLaMa 8B",
    "LLaMAX3-8B-Alpaca": "LLaMAX3 8B",
    "Llama-2-7b-chat-hf": "LLaMa2 7b",
    "Llama-3.1-70B-Instruct": "LLaMa3.1 70B",
    "Llama-3.1-8B-Instruct": "LLaMa3.1 8B",
    "Meta-Llama-3-8B-Instruct": "LLaMa3 8B",
    "aya-101": "Aya-101 13B",
    "gemma-1.1-7b-it": "Gemma1.1 7b",
    "gemma-2-27b-it": "Gemma2 27b",
    "gemma-2-9b-it": "Gemma2 9b",
    "gemini-1.5-pro-002": "Gemini 1.5 pro",
    "gpt-4o-2024-08-06": "GPT-4o (Aug)",
    "Gemma 2 IT 27B": "Gemma2 27b",
    "Gemma 2 IT 9B": "Gemma2 9b",
    "Aya-101": "Aya-101 13B",
    "Meta-Llama-3.1-70B-Instruct": "LLaMa3.1 70B",
    "LLaMAX3-8B": "LLaMAX3 8B",
    "LLaMaX 3 8B": "LLaMAX3 8B",
    "Meta-Llama-3-70B-Instruct": "LLaMa3.1 70B",
}
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # this points to /data
results_dir = os.path.join(BASE_DIR, "results")
community_results = os.path.join(BASE_DIR, "community_results")
output_direc = os.path.join(BASE_DIR, "leaderboard_json")
def generate_json_files(results=results_dir, community_result=community_results,
                        output_dir=output_direc, leaderboard=None):
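    """Aggregate per-dataset result CSVs into leaderboard JSON files.

    Depending on ``leaderboard``:
      * "afrobench": one JSON nested task -> subtask -> dataset -> model average.
      * "afrobench_lite": lite-subset JSON plus per-language average scores.
      * None (default): one JSON file per task with per-language model scores.
    """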
    os.makedirs(output_dir, exist_ok=True)
    # Holders for either per-task JSON data or the leaderboard variants.
    task_data = {}
    leaderboard_data = {}
    # model -> language -> list of scores across the lite datasets.
    lite_language_scores = {}
    task_map = {key.lower(): value for key, value in TASK_MAPPING.items()}
    afrobench_tasks = ["MasakhaPOS", "MasakhaNER", "Sentiment", "TC", "InjongoIntent", "AfriHate", "AfriXNLI",
                       "AfriQA", "UHURA", "RC", "MMLU", "AfriMGSM", "MT - en-xx", "MT - xx-en", "XLSUM", "ADR"]
    afrobench_tasks = [task.lower() for task in afrobench_tasks]
    afrobench_lite_datasets = ["injongointent", "sib", "afrixnli", "belebele", "afrimmlu", "afrimgsm",
                               "flores - en_xx"]
    afrobench_lite_languages = ["amh", "hau", "ibo", "kin", "lin", "lug", "orm", "sna", "sot", "swa",
                                "xho", "yor", "zul", "wol"]
    # Process each results CSV.
    for filename in os.listdir(results):
        if filename.endswith(".csv") and "- June2025.csv" not in filename:
            file_path = os.path.join(results, filename)
            dataset_name = filename.replace(" - 0-shot.csv", "").replace(" 0-shot.csv", "")
            # Identify task & subtask
            task_info = task_map.get(dataset_name.lower())
            if not task_info:
                print(f"Skipping unmapped dataset: {dataset_name.lower()}")
                continue
            task, subtask = task_info
            # Read CSV and drop index-artifact columns.
            df = pd.read_csv(file_path)
            drop_cols = [col for col in df.columns if "unnamed" in col.lower()]
            df.drop(drop_cols, axis=1, inplace=True)
            # Standardize model names (na=False guards against missing values).
            df.loc[df["model"].str.contains("LLaMaX", case=False, na=False), "model"] = "LLaMaX 3 8B"
            df = df[df["model"] != "InkubaLM-0.4B"].copy()
            df = df[df["model"] != "Claude 3.5 Sonnet"].copy()
            df.loc[df["model"].str.contains("gpt", case=False, na=False), "model"] = "gpt-4o-2024-08-06"
            df.loc[df["model"].str.contains("gemini", case=False, na=False), "model"] = "gemini-1.5-pro-002"
            df["model"] = df["model"].map(MODEL_MAP)
            df = df[df["model"].notna()].copy()  # drop any model missing from MODEL_MAP
            # Extract models and language columns.
            models = df["model"].unique()
            meta_columns = ["model", "prompt", "avg_score", "avg"]
            language_columns = [col for col in df.columns if col not in meta_columns]
            language_columns = [col for col in language_columns
                                if col.lower() not in {"eng", "fra", "eng_latn", "fra_latn", "en", "fr"}]
            avg_col = "avg" if "avg" in df.columns else "avg_score"
            if leaderboard == "afrobench":
                # Initialize leaderboard structure
                if dataset_name.lower() not in afrobench_tasks:
                    continue
                if task not in leaderboard_data:
                    leaderboard_data[task] = {}
                if subtask not in leaderboard_data[task]:
                    leaderboard_data[task][subtask] = {"datasets": {}}
                # Store per-model dataset scores, using each model's best-prompt row.
                dataset_scores = {}
                for model in models:
                    model_df = df[df["model"] == model]
                    best_avg_row = model_df.loc[model_df[avg_col].idxmax()]
                    scores = [best_avg_row[col] for col in language_columns if col in best_avg_row]
                    dataset_scores[model] = round(sum(scores) / len(scores), 1) if scores else None
                leaderboard_data[task][subtask]["datasets"][dataset_name] = dataset_scores
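                # Resulting shape (scores illustrative, not real results):
                # {"NLU": {"Senti": {"datasets": {"Sentiment": {"Gemma2 27b": 64.2, ...}}}}}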
            elif leaderboard == "afrobench_lite":
                if dataset_name.lower() in afrobench_lite_datasets:
                    if subtask not in leaderboard_data:
                        leaderboard_data[subtask] = {}
                    df.fillna(0, inplace=True)
                    # Store per-model dataset scores, averaged over the lite languages.
                    dataset_scores = {}
                    for model in models:
                        model_df = df[df["model"] == model]
                        best_avg_row = model_df.loc[model_df[avg_col].idxmax()]
                        # Match each lite language to its column by prefix
                        # (e.g. "swa" also matches "swa_Latn"-style headers).
                        scores = []
                        for lang in afrobench_lite_languages:
                            matching = [c for c in best_avg_row.index if c.split("_")[0] == lang]
                            if matching:
                                scores.append(best_avg_row[matching[0]])
                        dataset_scores[model] = round(sum(scores) / len(scores), 1) if scores else None
                        # Accumulate raw per-language scores (first row for the model).
                        lite_language_scores.setdefault(model, {})
                        for lang in afrobench_lite_languages:
                            if lang in df.columns:
                                val = df.loc[df["model"] == model, lang].values
                                if val.size > 0:
                                    lite_language_scores[model].setdefault(lang, []).append(val[0])
                    # Merge community-submitted results for the same task.
                    comm_df = pd.read_csv(os.path.join(community_result, "New Results - June2025.csv"))
                    comm_df = comm_df[comm_df["task"] == dataset_name]
                    comm_df.fillna(0, inplace=True)
                    for model in comm_df["model"].unique():
                        scores = [comm_df.loc[comm_df["model"] == model, col].values[0]
                                  for col in afrobench_lite_languages if col in comm_df.columns]
                        dataset_scores[model] = round(sum(scores) / len(scores), 1) if scores else None
                        lite_language_scores.setdefault(model, {})
                        for lang in afrobench_lite_languages:
                            if lang in comm_df.columns:
                                val = comm_df.loc[comm_df["model"] == model, lang].values
                                if val.size > 0:
                                    lite_language_scores[model].setdefault(lang, []).append(val[0])
                    leaderboard_data[subtask][dataset_name] = dataset_scores
            else:
                # Initialize task & subtask structure
                if task not in task_data:
                    task_data[task] = {"task": task, "subtasks": {}}
                if subtask not in task_data[task]["subtasks"]:
                    task_data[task]["subtasks"][subtask] = {"datasets": {}}
                # Store per-task dataset data
                task_data[task]["subtasks"][subtask]["datasets"][dataset_name] = {
                    "languages": language_columns,
                    "scores": {},
                }
                for model in models:
                    model_df = df[df["model"] == model]
                    best_avg_row = model_df.loc[model_df[avg_col].idxmax()]
                    model_scores = [round(score, 1) for score in best_avg_row[language_columns].to_list()]
                    task_data[task]["subtasks"][subtask]["datasets"][dataset_name]["scores"][model] = model_scores
    # Save the leaderboard JSON if a leaderboard variant was requested.
    if leaderboard:
        output_path = os.path.join(output_dir, f"{leaderboard}.json")
        with open(output_path, "w", encoding="utf-8") as json_file:
            json.dump(leaderboard_data, json_file, indent=4)
        print("Leaderboard JSON generated successfully!")
        if leaderboard == "afrobench_lite":
            lang_output = os.path.join(output_dir, "lite_language_scores.json")
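            # Collapse each model's per-language score lists into one mean per language,
            # e.g. {"Gemma2 27b": {"amh": 55.1, "hau": 61.0, ...}} (values illustrative).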
            averaged_scores = {
                model: {
                    lang: round(sum(scores) / len(scores), 1)
                    for lang, scores in langs.items()
                }
                for model, langs in lite_language_scores.items()
            }
            with open(lang_output, "w", encoding="utf-8") as f:
                json.dump(averaged_scores, f, indent=4)
            print("Saved language version for lite")
    # Otherwise save per-task JSON files (leaderboard=None).
    else:
        for task, data in task_data.items():
            output_path = os.path.join(output_dir, f"{task.lower().replace(' ', '_')}.json")
            with open(output_path, "w", encoding="utf-8") as json_file:
                json.dump(data, json_file, indent=4)
        print("Task-wise JSON files with subtasks generated successfully!")
if __name__ == "__main__":
    generate_json_files(leaderboard="afrobench_lite")
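    # Other supported modes (same results/ layout assumed):
    # generate_json_files(leaderboard="afrobench")  # single AfroBench leaderboard JSON
    # generate_json_files()                         # one JSON file per task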