Commit c0fa950 · Parent(s): 0266144

Portuguese Tasks configs and fix bugs

Files changed:
- .gitignore (+4 -1)
- src/display/utils.py (+14 -12)
- src/envs.py (+34 -15)
- src/tools/plots.py (+1 -1)
- tasks_config.yaml → tasks_config/legal_config.yaml (+0 -0)
- tasks_config/pt_config.yaml (+153 -0)
.gitignore CHANGED

@@ -1,6 +1,6 @@
 venv/
 __pycache__/
-.env
+.env*
 .ipynb_checkpoints
 *ipynb
 .vscode/
@@ -12,5 +12,8 @@ original_results/
 eval-queue/
 eval-results/
 dynamic-info/
+downloads/
+
+tasks_config/legal_config.yaml
 
 src/assets/model_counts.html
src/display/utils.py CHANGED

@@ -3,7 +3,7 @@ from enum import Enum
 from typing import List
 import pandas as pd
 from yaml import safe_load
-from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
+from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS, TASK_CONFIG
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -15,16 +15,17 @@ class Task:
     metric: str
     col_name: str
     baseline: float = 0.0
-    human_baseline: float = …
+    human_baseline: float = None
+    expert_human_baseline: float = None
     few_shot: int = None
     limit: int = None
     task_list: List[str] = None
     link: str = None
     description: str = None
+    sources: List[str] = None
+    baseline_sources: List[str] = None
 
-…
-    tasks_config = safe_load(f)
-    Tasks = Enum('Tasks', {k: Task(**v) for k, v in tasks_config['tasks'].items()})
+Tasks = Enum('Tasks', {k: Task(**v) for k, v in TASK_CONFIG['tasks'].items()})
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -108,11 +109,12 @@ baseline_row = {
 baseline_list = []
 for task in Tasks:
     baseline_row[task.value.col_name] = task.value.baseline
-    if task.value.baseline is not None:
+    if task.value.baseline is not None and (isinstance(task.value.baseline, float) or isinstance(task.value.baseline, int)):
         baseline_list.append(task.value.baseline)
 baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
-…
-…
+
+#if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+baseline_row["🤗 Leaderboard Average"] = None
 
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
@@ -145,17 +147,17 @@ human_baseline_row = {
     AutoEvalColumn.license.name: "",
     AutoEvalColumn.still_on_hub.name: False,
     AutoEvalColumn.moe.name: False,
-    AutoEvalColumn.eval_time.name: 0.0
+    AutoEvalColumn.eval_time.name: 0.0,
 }
 
 baseline_list = []
 for task in Tasks:
     human_baseline_row[task.value.col_name] = task.value.human_baseline
-    if task.value.human_baseline is not None:
+    if task.value.human_baseline is not None and (isinstance(task.value.baseline, float) or isinstance(task.value.baseline, int)):
         baseline_list.append(task.value.human_baseline)
 human_baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
-if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
-…
+#if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+human_baseline_row["🤗 Leaderboard Average"] = None
 
 @dataclass
 class ModelDetails:
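For context, the new utils.py code builds the Tasks enum straight from the loaded YAML instead of opening a hard-coded tasks_config.yaml. Below is a minimal, self-contained sketch of that pattern, not the repository's actual module: the Task dataclass is reduced to a few fields and the YAML snippet is inlined purely for illustration.

# Sketch of the Enum-from-config pattern used in src/display/utils.py.
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional

from yaml import safe_load

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    baseline: float = 0.0
    human_baseline: Optional[float] = None
    task_list: Optional[List[str]] = None

# Illustrative stand-in for the dict that src/envs.py exposes as TASK_CONFIG.
TASK_CONFIG = safe_load("""
tasks:
  enem_challenge:
    benchmark: enem_challenge
    metric: acc
    col_name: ENEM
    baseline: 20.0
    human_baseline: 35.0
    task_list: [enem_challenge]
""")

# Same construction as in the diff: one enum member per configured task,
# each carrying a Task instance as its value.
Tasks = Enum("Tasks", {k: Task(**v) for k, v in TASK_CONFIG["tasks"].items()})

print(Tasks.enem_challenge.value.col_name)   # ENEM
print([t.value.baseline for t in Tasks])     # [20.0]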
src/envs.py CHANGED

@@ -1,25 +1,44 @@
 import os
+from yaml import safe_load
 
 from huggingface_hub import HfApi
 
+TASK_CONFIG_NAME = os.getenv("TASK_CONFIG", "pt_config")
+TASK_CONFIG_PATH = os.path.join('tasks_config', TASK_CONFIG_NAME + ".yaml")
+with open(TASK_CONFIG_PATH, 'r', encoding='utf-8') as f:
+    TASK_CONFIG = safe_load(f)
+
+def get_config(name, default):
+    res = None
+
+    if name in os.environ:
+        res = os.environ[name]
+    elif 'config' in TASK_CONFIG:
+        res = TASK_CONFIG['config'].get(name, None)
+
+    if res is None:
+        return default
+    return res
+
 # clone / pull the lmeh eval data
-H4_TOKEN = …
+H4_TOKEN = get_config("H4_TOKEN", None)
 
-LEADERBOARD_NAME = …
+LEADERBOARD_NAME = get_config("LEADERBOARD_NAME", "Open LLM Leaderboard")
 
-REPO_ID = …
-QUEUE_REPO = …
-DYNAMIC_INFO_REPO = …
-RESULTS_REPO = …
+REPO_ID = get_config("REPO_ID", "HuggingFaceH4/open_llm_leaderboard")
+QUEUE_REPO = get_config("QUEUE_REPO", "open-llm-leaderboard/requests")
+DYNAMIC_INFO_REPO = get_config("DYNAMIC_INFO_REPO", "open-llm-leaderboard/dynamic_model_information")
+RESULTS_REPO = get_config("RESULTS_REPO", "open-llm-leaderboard/results")
+RAW_RESULTS_REPO = get_config("RAW_RESULTS_REPO", None)
 
 PRIVATE_QUEUE_REPO = QUEUE_REPO
 PRIVATE_RESULTS_REPO = RESULTS_REPO
 #PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
 #PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
 
-IS_PUBLIC = bool(…
+IS_PUBLIC = bool(get_config("IS_PUBLIC", True))
 
-CACHE_PATH=…
+CACHE_PATH=get_config("HF_HOME", ".")
 
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
@@ -29,18 +48,18 @@ DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
 EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
 
-PATH_TO_COLLECTION = …
+PATH_TO_COLLECTION = get_config("PATH_TO_COLLECTION", "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03")
 
 # Rate limit variables
-RATE_LIMIT_PERIOD = int(…
-RATE_LIMIT_QUOTA = int(…
-HAS_HIGHER_RATE_LIMIT = …
+RATE_LIMIT_PERIOD = int(get_config("RATE_LIMIT_PERIOD", 7))
+RATE_LIMIT_QUOTA = int(get_config("RATE_LIMIT_QUOTA", 5))
+HAS_HIGHER_RATE_LIMIT = get_config("HAS_HIGHER_RATE_LIMIT", "TheBloke").split(',')
 
-TRUST_REMOTE_CODE = bool(…
+TRUST_REMOTE_CODE = bool(get_config("TRUST_REMOTE_CODE", False))
 
 #Set if you want to get an extra field with the average eval results from the HF leaderboard
-GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS = bool(…
-ORIGINAL_HF_LEADERBOARD_RESULTS_REPO = …
+GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS = bool(get_config("GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS", False))
+ORIGINAL_HF_LEADERBOARD_RESULTS_REPO = get_config("ORIGINAL_HF_LEADERBOARD_RESULTS_REPO", "open-llm-leaderboard/results")
 ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, 'original_results')
 
 API = HfApi(token=H4_TOKEN)
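The new get_config helper gives every setting a three-level precedence: an environment variable overrides the config: section of the selected tasks_config YAML, which in turn overrides the hard-coded default. A small standalone sketch of that resolution order follows; the TASK_CONFIG dict here is hand-written for illustration rather than loaded from disk.

import os

# Stand-in for the dict that src/envs.py loads from tasks_config/<TASK_CONFIG>.yaml.
TASK_CONFIG = {"config": {"LEADERBOARD_NAME": "Open PT-LLM Leaderboard"}}

def get_config(name, default):
    # 1) environment variable, 2) "config:" section of the YAML, 3) default.
    res = None
    if name in os.environ:
        res = os.environ[name]
    elif "config" in TASK_CONFIG:
        res = TASK_CONFIG["config"].get(name, None)
    if res is None:
        return default
    return res

print(get_config("LEADERBOARD_NAME", "Open LLM Leaderboard"))  # YAML value: Open PT-LLM Leaderboard
os.environ["LEADERBOARD_NAME"] = "My Local Fork"
print(get_config("LEADERBOARD_NAME", "Open LLM Leaderboard"))  # env var wins: My Local Fork
print(get_config("RATE_LIMIT_QUOTA", 5))                       # neither set: falls back to 5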
src/tools/plots.py CHANGED

@@ -99,7 +99,7 @@ def create_metric_plot_obj(
     df = df[df["task"].isin(metrics)]
 
     # Filter the human baselines based on the specified metrics
-    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
+    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics if v is not None}
 
     # Create a line figure using plotly express with specified markers and custom data
     fig = px.line(
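The plots.py change chains two if clauses in the dict comprehension, which behave like a logical and: tasks without a human baseline are now dropped before plotting. A toy illustration, with made-up baseline values:

# Chained "if" clauses in a comprehension act as a conjunction.
HUMAN_BASELINE = {"ENEM": 35.0, "BLUEX": 50.0, "ASSIN2 RTE": None}  # illustrative values
metrics = ["ENEM", "ASSIN2 RTE"]

filtered = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics if v is not None}
print(filtered)  # {'ENEM': 35.0} -- the task with no human baseline is excluded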
tasks_config.yaml → tasks_config/legal_config.yaml RENAMED

File without changes
tasks_config/pt_config.yaml ADDED

version: 1.0.0
config:
  REPO_ID: "eduagarcia/open_pt_llm_leaderboard"
  QUEUE_REPO: eduagarcia-temp/llm_pt_leaderboard_requests
  RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_results
  RAW_RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_raw_results
  DYNAMIC_INFO_REPO: "eduagarcia-temp/llm_pt_leaderboard_model_info"
  PATH_TO_COLLECTION: "eduagarcia/portuguese-llm-leaderboard-best-models-65c152c13ab3c67bc4f203a6"
  IS_PUBLIC: true
  LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
  GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
  TRUST_REMOTE_CODE: true
tasks:
  enem_challenge:
    benchmark: enem_challenge
    col_name: ENEM
    task_list:
    - enem_challenge
    metric: acc
    few_shot: 3
    limit: null
    baseline: 20.0 # random baseline
    # https://www.sejalguem.com/enem
    # https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html
    human_baseline: 35.0 # ~60 / 180 correct answers - score ~500
    expert_human_baseline: 70.0 # ~124 / 180 correct answers - score ~700
    description: "The Exame Nacional do Ensino Médio (ENEM) is an advanced high-school-level
      exam applied every year by the Brazilian government to students who wish to
      undertake a university degree. This dataset contains 1,430 questions from the
      2010-2018, 2022 and 2023 exams that do not require image understanding."
    link: https://huggingface.co/datasets/eduagarcia/enem_challenge
    sources: ["https://www.ime.usp.br/~ddm/project/enem/", "https://github.com/piresramon/gpt-4-enem", "https://huggingface.co/datasets/maritaca-ai/enem"]
    baseline_sources: ["https://www.sejalguem.com/enem", "https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html"]
  bluex:
    benchmark: bluex
    col_name: BLUEX
    task_list:
    - bluex
    metric: acc
    few_shot: 3
    limit: null
    baseline: 22.5 # random baseline
    # https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf 56% mean - 88% @ top-.99
    # https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf 43.4% - ~77% @ top-.99
    human_baseline: 50.0
    expert_human_baseline: 82.5
    description: "BLUEX is a multimodal dataset consisting of the two leading
      university entrance exams conducted in Brazil, Convest (Unicamp) and Fuvest (USP),
      spanning from 2018 to 2024. The benchmark comprises 724 questions that do not have accompanying images."
    link: https://huggingface.co/datasets/eduagarcia-temp/BLUEX_without_images
    sources: ["https://github.com/portuguese-benchmark-datasets/bluex", "https://huggingface.co/datasets/portuguese-benchmark-datasets/BLUEX"]
    baseline_sources: ["https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf", "https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf"]
  oab_exams:
    benchmark: oab_exams
    col_name: OAB Exams
    task_list:
    - oab_exams
    metric: acc
    few_shot: 3
    limit: null
    baseline: 25.0 # random baseline
    # https://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros # 46%
    # http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3
    # Scored 70%+ = 17214 / 638500 = top 97.5%
    # deviation to the 97.5th percentile -> 70.0% - 46% = 24
    # z-score at 97.5% ~ 1.9675
    # estimated standard deviation -> 12.2
    # top 99% = 46 + 2.33*12.2 = ~75.0
    human_baseline: 46.0
    expert_human_baseline: 75.0
    description: OAB Exams is a dataset of more than 2,000 questions from the Brazilian Bar
      Association's exams, from 2010 to 2018.
    link: https://huggingface.co/datasets/eduagarcia/oab_exams
    sources: ["https://github.com/legal-nlp/oab-exams"]
    baseline_sources: ["http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol2", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3"]
  assin2_rte:
    benchmark: assin2_rte
    col_name: ASSIN2 RTE
    task_list:
    - assin2_rte
    metric: f1_macro
    few_shot: 15
    limit: null
    baseline: 50.0 # random baseline
    human_baseline: null
    expert_human_baseline: null
    description: "ASSIN 2 (Avaliação de Similaridade Semântica e Inferência Textual -
      Evaluating Semantic Similarity and Textual Entailment) is the second edition of ASSIN,
      an evaluation shared task in the scope of the computational processing of Portuguese.
      Recognising Textual Entailment (RTE), also called Natural Language Inference (NLI),
      is the task of predicting whether a given text (premise) entails (implies)
      another text (hypothesis)."
    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
    sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
  assin2_sts:
    benchmark: assin2_sts
    col_name: ASSIN2 STS
    task_list:
    - assin2_sts
    metric: pearson
    few_shot: 15
    limit: null
    baseline: 0.0 # random baseline
    human_baseline: null
    expert_human_baseline: null
    description: "Same dataset as above. Semantic Textual Similarity (STS)
      ‘measures the degree of semantic equivalence between two sentences’."
    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
    sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
  faquad_nli:
    benchmark: faquad_nli
    col_name: FAQUAD NLI
    task_list:
    - faquad_nli
    metric: f1_macro
    few_shot: 15
    limit: null
    baseline: 45.6 # random baseline
    human_baseline: null
    expert_human_baseline: null
    description: "FaQuAD is a Portuguese reading comprehension dataset that follows the format of the
      Stanford Question Answering Dataset (SQuAD). The dataset aims to address the problem of
      abundant questions sent by academics whose answers are found in available institutional
      documents in the Brazilian higher education system. It consists of 900 questions about
      249 reading passages taken from 18 official documents of a computer science college
      from a Brazilian federal university and 21 Wikipedia articles related to the
      Brazilian higher education system. FaQuAD-NLI is a modified version of the
      FaQuAD dataset that repurposes the question answering task as a textual
      entailment task between a question and its possible answers."
    link: https://huggingface.co/datasets/ruanchaves/faquad-nli
    sources: ["https://github.com/liafacom/faquad/"]
  sparrow_pt:
    benchmark: sparrow_pt
    col_name: Sparrow POR
    task_list:
    - sparrow_emotion-2021-cortiz-por
    - sparrow_hate-2019-fortuna-por
    - sparrow_sentiment-2016-mozetic-por
    - sparrow_sentiment-2018-brum-por
    metric: f1_macro
    few_shot: 15
    limit: 500
    baseline: 29.5 # random baseline [3.3, 48.8, 33.1, 33.0]
    human_baseline: null
    expert_human_baseline: null
    description: "SPARROW is a multilingual evaluation benchmark for sociopragmatic meaning understanding.
      SPARROW comprises 169 datasets covering 64 different languages; this split evaluates only
      the validation sets of the 4 datasets available for Portuguese: one on hate speech detection
      by Fortuna et al. (2019), one on emotion detection by Cortiz et al. (2021), and two on
      sentiment analysis by Mozetic et al. (2016) and Brum et al. (2018).
      All were extracted from Twitter/X and manually annotated."
    link: https://huggingface.co/datasets/UBC-NLP/sparrow
    sources: ["https://sparrow.dlnlp.ai/", "https://aclanthology.org/W19-3510/", "https://arxiv.org/abs/2108.07638", "https://aclanthology.org/L18-1658/", "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0155036"]
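To try the new configuration end to end, the selection mechanism from src/envs.py can be exercised directly. A short sketch, assuming it is run from the repository root so tasks_config/pt_config.yaml resolves:

import os

from yaml import safe_load

# src/envs.py already defaults TASK_CONFIG to "pt_config"; setting it here is just explicit.
os.environ["TASK_CONFIG"] = "pt_config"

path = os.path.join("tasks_config", os.environ["TASK_CONFIG"] + ".yaml")
with open(path, "r", encoding="utf-8") as f:
    cfg = safe_load(f)

print(cfg["config"]["LEADERBOARD_NAME"])  # Open PT-LLM Leaderboard
for name, task in cfg["tasks"].items():
    print(f"{task['col_name']:<12} metric={task['metric']} few_shot={task['few_shot']} baseline={task['baseline']}")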