Commit c0fa950 · Parent(s): 0266144

Portuguese Tasks configs and fix bugs

Files changed:
- .gitignore (+4 -1)
- src/display/utils.py (+14 -12)
- src/envs.py (+34 -15)
- src/tools/plots.py (+1 -1)
- tasks_config.yaml → tasks_config/legal_config.yaml (+0 -0)
- tasks_config/pt_config.yaml (+153 -0)
.gitignore CHANGED

@@ -1,6 +1,6 @@
 venv/
 __pycache__/
-.env
+.env*
 .ipynb_checkpoints
 *ipynb
 .vscode/
@@ -12,5 +12,8 @@ original_results/
 eval-queue/
 eval-results/
 dynamic-info/
+downloads/
+
+tasks_config/legal_config.yaml
 
 src/assets/model_counts.html
src/display/utils.py CHANGED

@@ -3,7 +3,7 @@ from enum import Enum
 from typing import List
 import pandas as pd
 from yaml import safe_load
-from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
+from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS, TASK_CONFIG
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -15,16 +15,17 @@ class Task:
     metric: str
     col_name: str
     baseline: float = 0.0
-    human_baseline: float = …
+    human_baseline: float = None
+    expert_human_baseline: float = None
     few_shot: int = None
     limit: int = None
     task_list: List[str] = None
     link: str = None
     description: str = None
+    sources: List[str] = None
+    baseline_sources: List[str] = None
 
-…
-    tasks_config = safe_load(f)
-    Tasks = Enum('Tasks', {k: Task(**v) for k, v in tasks_config['tasks'].items()})
+Tasks = Enum('Tasks', {k: Task(**v) for k, v in TASK_CONFIG['tasks'].items()})
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -108,11 +109,12 @@ baseline_row = {
 baseline_list = []
 for task in Tasks:
     baseline_row[task.value.col_name] = task.value.baseline
-    if task.value.baseline is not None:
+    if task.value.baseline is not None and (isinstance(task.value.baseline, float) or isinstance(task.value.baseline, int)):
         baseline_list.append(task.value.baseline)
 baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
-…
-…
+
+#if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+baseline_row["🤗 Leaderboard Average"] = None
 
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
@@ -145,17 +147,17 @@ human_baseline_row = {
     AutoEvalColumn.license.name: "",
     AutoEvalColumn.still_on_hub.name: False,
     AutoEvalColumn.moe.name: False,
-    AutoEvalColumn.eval_time.name: 0.0
+    AutoEvalColumn.eval_time.name: 0.0,
 }
 
 baseline_list = []
 for task in Tasks:
     human_baseline_row[task.value.col_name] = task.value.human_baseline
-    if task.value.human_baseline is not None:
+    if task.value.human_baseline is not None and (isinstance(task.value.baseline, float) or isinstance(task.value.baseline, int)):
         baseline_list.append(task.value.human_baseline)
 human_baseline_row[AutoEvalColumn.average.name] = round(sum(baseline_list) / len(baseline_list), 2)
-if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
-…
+#if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
+human_baseline_row["🤗 Leaderboard Average"] = None
 
 @dataclass
 class ModelDetails:
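For context, the new utils.py code builds the Tasks enum straight from the loaded YAML instead of opening a hard-coded tasks_config.yaml. Below is a minimal, self-contained sketch of that pattern, not the repository's actual module: the Task dataclass is reduced to a few fields and the YAML snippet is inlined purely for illustration.

# Sketch of the Enum-from-config pattern used in src/display/utils.py.
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional

from yaml import safe_load

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    baseline: float = 0.0
    human_baseline: Optional[float] = None
    task_list: Optional[List[str]] = None

# Illustrative stand-in for the dict that src/envs.py exposes as TASK_CONFIG.
TASK_CONFIG = safe_load("""
tasks:
  enem_challenge:
    benchmark: enem_challenge
    metric: acc
    col_name: ENEM
    baseline: 20.0
    human_baseline: 35.0
    task_list: [enem_challenge]
""")

# Same construction as in the diff: one enum member per configured task,
# each carrying a Task instance as its value.
Tasks = Enum("Tasks", {k: Task(**v) for k, v in TASK_CONFIG["tasks"].items()})

print(Tasks.enem_challenge.value.col_name)   # ENEM
print([t.value.baseline for t in Tasks])     # [20.0]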
src/envs.py CHANGED

@@ -1,25 +1,44 @@
 import os
+from yaml import safe_load
 
 from huggingface_hub import HfApi
 
+TASK_CONFIG_NAME = os.getenv("TASK_CONFIG", "pt_config")
+TASK_CONFIG_PATH = os.path.join('tasks_config', TASK_CONFIG_NAME + ".yaml")
+with open(TASK_CONFIG_PATH, 'r', encoding='utf-8') as f:
+    TASK_CONFIG = safe_load(f)
+
+def get_config(name, default):
+    res = None
+
+    if name in os.environ:
+        res = os.environ[name]
+    elif 'config' in TASK_CONFIG:
+        res = TASK_CONFIG['config'].get(name, None)
+
+    if res is None:
+        return default
+    return res
+
 # clone / pull the lmeh eval data
-H4_TOKEN = …
+H4_TOKEN = get_config("H4_TOKEN", None)
 
-LEADERBOARD_NAME = …
+LEADERBOARD_NAME = get_config("LEADERBOARD_NAME", "Open LLM Leaderboard")
 
-REPO_ID = …
-QUEUE_REPO = …
-DYNAMIC_INFO_REPO = …
-RESULTS_REPO = …
+REPO_ID = get_config("REPO_ID", "HuggingFaceH4/open_llm_leaderboard")
+QUEUE_REPO = get_config("QUEUE_REPO", "open-llm-leaderboard/requests")
+DYNAMIC_INFO_REPO = get_config("DYNAMIC_INFO_REPO", "open-llm-leaderboard/dynamic_model_information")
+RESULTS_REPO = get_config("RESULTS_REPO", "open-llm-leaderboard/results")
+RAW_RESULTS_REPO = get_config("RAW_RESULTS_REPO", None)
 
 PRIVATE_QUEUE_REPO = QUEUE_REPO
 PRIVATE_RESULTS_REPO = RESULTS_REPO
 #PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
 #PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
 
-IS_PUBLIC = bool(…
+IS_PUBLIC = bool(get_config("IS_PUBLIC", True))
 
-CACHE_PATH=…
+CACHE_PATH=get_config("HF_HOME", ".")
 
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
@@ -29,18 +48,18 @@ DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
 EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
 EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
 
-PATH_TO_COLLECTION = …
+PATH_TO_COLLECTION = get_config("PATH_TO_COLLECTION", "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03")
 
 # Rate limit variables
-RATE_LIMIT_PERIOD = int(…
-RATE_LIMIT_QUOTA = int(…
-HAS_HIGHER_RATE_LIMIT = …
+RATE_LIMIT_PERIOD = int(get_config("RATE_LIMIT_PERIOD", 7))
+RATE_LIMIT_QUOTA = int(get_config("RATE_LIMIT_QUOTA", 5))
+HAS_HIGHER_RATE_LIMIT = get_config("HAS_HIGHER_RATE_LIMIT", "TheBloke").split(',')
 
-TRUST_REMOTE_CODE = bool(…
+TRUST_REMOTE_CODE = bool(get_config("TRUST_REMOTE_CODE", False))
 
 #Set if you want to get an extra field with the average eval results from the HF leaderboard
-GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS = bool(…
-ORIGINAL_HF_LEADERBOARD_RESULTS_REPO = …
+GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS = bool(get_config("GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS", False))
+ORIGINAL_HF_LEADERBOARD_RESULTS_REPO = get_config("ORIGINAL_HF_LEADERBOARD_RESULTS_REPO", "open-llm-leaderboard/results")
 ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, 'original_results')
 
 API = HfApi(token=H4_TOKEN)
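The new get_config helper gives every setting a three-level precedence: an environment variable overrides the config: section of the selected tasks_config YAML, which in turn overrides the hard-coded default. A small standalone sketch of that resolution order follows; the TASK_CONFIG dict here is hand-written for illustration rather than loaded from disk.

import os

# Stand-in for the dict that src/envs.py loads from tasks_config/<TASK_CONFIG>.yaml.
TASK_CONFIG = {"config": {"LEADERBOARD_NAME": "Open PT-LLM Leaderboard"}}

def get_config(name, default):
    # 1) environment variable, 2) "config:" section of the YAML, 3) default.
    res = None
    if name in os.environ:
        res = os.environ[name]
    elif "config" in TASK_CONFIG:
        res = TASK_CONFIG["config"].get(name, None)
    if res is None:
        return default
    return res

print(get_config("LEADERBOARD_NAME", "Open LLM Leaderboard"))  # YAML value: Open PT-LLM Leaderboard
os.environ["LEADERBOARD_NAME"] = "My Local Fork"
print(get_config("LEADERBOARD_NAME", "Open LLM Leaderboard"))  # env var wins: My Local Fork
print(get_config("RATE_LIMIT_QUOTA", 5))                       # neither set: falls back to 5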
src/tools/plots.py CHANGED

@@ -99,7 +99,7 @@ def create_metric_plot_obj(
     df = df[df["task"].isin(metrics)]
 
     # Filter the human baselines based on the specified metrics
-    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
+    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics if v is not None}
 
     # Create a line figure using plotly express with specified markers and custom data
     fig = px.line(
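The plots.py change chains two if clauses in the dict comprehension, which behave like a logical and: tasks without a human baseline are now dropped before plotting. A toy illustration, with made-up baseline values:

# Chained "if" clauses in a comprehension act as a conjunction.
HUMAN_BASELINE = {"ENEM": 35.0, "BLUEX": 50.0, "ASSIN2 RTE": None}  # illustrative values
metrics = ["ENEM", "ASSIN2 RTE"]

filtered = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics if v is not None}
print(filtered)  # {'ENEM': 35.0} -- the task with no human baseline is excluded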
tasks_config.yaml → tasks_config/legal_config.yaml RENAMED

File without changes
tasks_config/pt_config.yaml ADDED

version: 1.0.0
config:
  REPO_ID: "eduagarcia/open_pt_llm_leaderboard"
  QUEUE_REPO: eduagarcia-temp/llm_pt_leaderboard_requests
  RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_results
  RAW_RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_raw_results
  DYNAMIC_INFO_REPO: "eduagarcia-temp/llm_pt_leaderboard_model_info"
  PATH_TO_COLLECTION: "eduagarcia/portuguese-llm-leaderboard-best-models-65c152c13ab3c67bc4f203a6"
  IS_PUBLIC: true
  LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
  GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
  TRUST_REMOTE_CODE: true
tasks:
  enem_challenge:
    benchmark: enem_challenge
    col_name: ENEM
    task_list:
    - enem_challenge
    metric: acc
    few_shot: 3
    limit: null
    baseline: 20.0 # random baseline
    # https://www.sejalguem.com/enem
    # https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html
    human_baseline: 35.0 # ~60 / 180 correct answers - score ~500
    expert_human_baseline: 70.0 # ~124 / 180 correct answers - score ~700
    description: "The Exame Nacional do Ensino Médio (ENEM) is an advanced high-school-level
      exam applied every year by the Brazilian government to students who wish to
      undertake a university degree. This dataset contains 1,430 questions from the
      2010-2018, 2022 and 2023 exams that do not require image understanding."
    link: https://huggingface.co/datasets/eduagarcia/enem_challenge
    sources: ["https://www.ime.usp.br/~ddm/project/enem/", "https://github.com/piresramon/gpt-4-enem", "https://huggingface.co/datasets/maritaca-ai/enem"]
    baseline_sources: ["https://www.sejalguem.com/enem", "https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html"]
  bluex:
    benchmark: bluex
    col_name: BLUEX
    task_list:
    - bluex
    metric: acc
    few_shot: 3
    limit: null
    baseline: 22.5 # random baseline
    # https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf 56% mean - 88% @ top-.99
    # https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf 43.4% - ~77% @ top-.99
    human_baseline: 50.0
    expert_human_baseline: 82.5
    description: "BLUEX is a multimodal dataset consisting of the two leading
      university entrance exams conducted in Brazil, Convest (Unicamp) and Fuvest (USP),
      spanning from 2018 to 2024. The benchmark comprises 724 questions that do not have accompanying images."
    link: https://huggingface.co/datasets/eduagarcia-temp/BLUEX_without_images
    sources: ["https://github.com/portuguese-benchmark-datasets/bluex", "https://huggingface.co/datasets/portuguese-benchmark-datasets/BLUEX"]
    baseline_sources: ["https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf", "https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf"]
  oab_exams:
    benchmark: oab_exams
    col_name: OAB Exams
    task_list:
    - oab_exams
    metric: acc
    few_shot: 3
    limit: null
    baseline: 25.0 # random baseline
    # https://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros # 46%
    # http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3
    # Scored 70%+ = 17214 / 638500 = top 97.5%
    # deviation to the 97.5th percentile -> 70.0% - 46% = 24
    # z-score at 97.5% ~ 1.9675
    # estimated standard deviation -> 12.2
    # top 99% = 46 + 2.33*12.2 = ~75.0
    human_baseline: 46.0
    expert_human_baseline: 75.0
    description: OAB Exams is a dataset of more than 2,000 questions from the Brazilian Bar
      Association's exams, from 2010 to 2018.
    link: https://huggingface.co/datasets/eduagarcia/oab_exams
    sources: ["https://github.com/legal-nlp/oab-exams"]
    baseline_sources: ["http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol2", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3"]
  assin2_rte:
    benchmark: assin2_rte
    col_name: ASSIN2 RTE
    task_list:
    - assin2_rte
    metric: f1_macro
    few_shot: 15
    limit: null
    baseline: 50.0 # random baseline
    human_baseline: null
    expert_human_baseline: null
    description: "ASSIN 2 (Avaliação de Similaridade Semântica e Inferência Textual -
      Evaluating Semantic Similarity and Textual Entailment) is the second edition of ASSIN,
      an evaluation shared task in the scope of the computational processing of Portuguese.
      Recognising Textual Entailment (RTE), also called Natural Language Inference (NLI),
      is the task of predicting whether a given text (premise) entails (implies)
      another text (hypothesis)."
    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
    sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
  assin2_sts:
    benchmark: assin2_sts
    col_name: ASSIN2 STS
    task_list:
    - assin2_sts
    metric: pearson
    few_shot: 15
    limit: null
    baseline: 0.0 # random baseline
    human_baseline: null
    expert_human_baseline: null
    description: "Same dataset as above. Semantic Textual Similarity (STS)
      ‘measures the degree of semantic equivalence between two sentences’."
    link: https://huggingface.co/datasets/eduagarcia/portuguese_benchmark
    sources: ["https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
  faquad_nli:
    benchmark: faquad_nli
    col_name: FAQUAD NLI
    task_list:
    - faquad_nli
    metric: f1_macro
    few_shot: 15
    limit: null
    baseline: 45.6 # random baseline
    human_baseline: null
    expert_human_baseline: null
    description: "FaQuAD is a Portuguese reading comprehension dataset that follows the format of the
      Stanford Question Answering Dataset (SQuAD). The dataset aims to address the problem of
      abundant questions sent by academics whose answers are found in available institutional
      documents in the Brazilian higher education system. It consists of 900 questions about
      249 reading passages taken from 18 official documents of a computer science college
      from a Brazilian federal university and 21 Wikipedia articles related to the
      Brazilian higher education system. FaQuAD-NLI is a modified version of the
      FaQuAD dataset that repurposes the question answering task as a textual
      entailment task between a question and its possible answers."
    link: https://huggingface.co/datasets/ruanchaves/faquad-nli
    sources: ["https://github.com/liafacom/faquad/"]
  sparrow_pt:
    benchmark: sparrow_pt
    col_name: Sparrow POR
    task_list:
    - sparrow_emotion-2021-cortiz-por
    - sparrow_hate-2019-fortuna-por
    - sparrow_sentiment-2016-mozetic-por
    - sparrow_sentiment-2018-brum-por
    metric: f1_macro
    few_shot: 15
    limit: 500
    baseline: 29.5 # random baseline [3.3, 48.8, 33.1, 33.0]
    human_baseline: null
    expert_human_baseline: null
    description: "SPARROW is a multilingual evaluation benchmark for sociopragmatic meaning understanding.
      SPARROW comprises 169 datasets covering 64 different languages; this split evaluates only
      the validation sets of the 4 datasets available for Portuguese: one on hate speech detection
      by Fortuna et al. (2019), one on emotion detection by Cortiz et al. (2021), and two on
      sentiment analysis by Mozetic et al. (2016) and Brum et al. (2018).
      All were extracted from Twitter/X and manually annotated."
    link: https://huggingface.co/datasets/UBC-NLP/sparrow
    sources: ["https://sparrow.dlnlp.ai/", "https://aclanthology.org/W19-3510/", "https://arxiv.org/abs/2108.07638", "https://aclanthology.org/L18-1658/", "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0155036"]
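To try the new configuration end to end, the selection mechanism from src/envs.py can be exercised directly. A short sketch, assuming it is run from the repository root so tasks_config/pt_config.yaml resolves:

import os

from yaml import safe_load

# src/envs.py already defaults TASK_CONFIG to "pt_config"; setting it here is just explicit.
os.environ["TASK_CONFIG"] = "pt_config"

path = os.path.join("tasks_config", os.environ["TASK_CONFIG"] + ".yaml")
with open(path, "r", encoding="utf-8") as f:
    cfg = safe_load(f)

print(cfg["config"]["LEADERBOARD_NAME"])  # Open PT-LLM Leaderboard
for name, task in cfg["tasks"].items():
    print(f"{task['col_name']:<12} metric={task['metric']} few_shot={task['few_shot']} baseline={task['baseline']}")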