Spaces:
				
			
			
	
			
			
					
		Running
		
			on 
			
			CPU Upgrade
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
			on 
			
			CPU Upgrade
	Commit 
							
							·
						
						91c6e89
	
1
								Parent(s):
							
							c8b2c09
								
change 'proprietary' models to 'external' models and added new models
Browse files
    	
        proprietary_models_results.json → external_models_results.json
    RENAMED
    
    | @@ -6,6 +6,7 @@ | |
| 6 | 
             
                    "date": "2024-04-12",
         | 
| 7 | 
             
                    "status": "full",
         | 
| 8 | 
             
                    "main_language": "Portuguese",
         | 
|  | |
| 9 | 
             
                    "result_metrics": {
         | 
| 10 | 
             
                        "enem_challenge": 0.7172848145556333,
         | 
| 11 | 
             
                        "bluex": 0.5549374130737135,
         | 
| @@ -27,6 +28,7 @@ | |
| 27 | 
             
                    "date": "2024-04-13",
         | 
| 28 | 
             
                    "status": "full",
         | 
| 29 | 
             
                    "main_language": "Portuguese",
         | 
|  | |
| 30 | 
             
                    "result_metrics": {
         | 
| 31 | 
             
                        "enem_challenge": 0.8180545836249126,
         | 
| 32 | 
             
                        "bluex": 0.717663421418637,
         | 
| @@ -48,6 +50,7 @@ | |
| 48 | 
             
                    "date": "2024-03-08",
         | 
| 49 | 
             
                    "status": "full",
         | 
| 50 | 
             
                    "main_language": "English",
         | 
|  | |
| 51 | 
             
                    "result_metrics": {
         | 
| 52 | 
             
                        "enem_challenge": 0.7214835549335199,
         | 
| 53 | 
             
                        "bluex": 0.6244784422809457,
         | 
| @@ -69,6 +72,7 @@ | |
| 69 | 
             
                    "date": "2024-04-13",
         | 
| 70 | 
             
                    "status": "full",
         | 
| 71 | 
             
                    "main_language": "English",
         | 
|  | |
| 72 | 
             
                    "result_metrics": {
         | 
| 73 | 
             
                        "enem_challenge": 0.7718684394681595,
         | 
| 74 | 
             
                        "bluex": 0.6662030598052852,
         | 
| @@ -90,6 +94,7 @@ | |
| 90 | 
             
                    "date": "2024-03-08",
         | 
| 91 | 
             
                    "status": "full",
         | 
| 92 | 
             
                    "main_language": "English",
         | 
|  | |
| 93 | 
             
                    "result_metrics": {
         | 
| 94 | 
             
                        "enem_challenge": 0.7130860741777467,
         | 
| 95 | 
             
                        "bluex": 0.5869262865090403,
         | 
| @@ -111,6 +116,7 @@ | |
| 111 | 
             
                    "date": "2024-04-15",
         | 
| 112 | 
             
                    "status": "full",
         | 
| 113 | 
             
                    "main_language": "English",
         | 
|  | |
| 114 | 
             
                    "result_metrics": {
         | 
| 115 | 
             
                        "enem_challenge": 0.8509447165850245,
         | 
| 116 | 
             
                        "bluex": 0.7719054242002782,
         | 
| @@ -132,6 +138,7 @@ | |
| 132 | 
             
                    "date": "2024-05-18",
         | 
| 133 | 
             
                    "status": "full",
         | 
| 134 | 
             
                    "main_language": "English",
         | 
|  | |
| 135 | 
             
                    "result_metrics": {
         | 
| 136 | 
             
                        "enem_challenge": 0.7844646606018194,
         | 
| 137 | 
             
                        "bluex": 0.6954102920723226,
         | 
| @@ -153,6 +160,7 @@ | |
| 153 | 
             
                    "date": "2024-05-18",
         | 
| 154 | 
             
                    "status": "full",
         | 
| 155 | 
             
                    "main_language": "English",
         | 
|  | |
| 156 | 
             
                    "result_metrics": {
         | 
| 157 | 
             
                        "enem_challenge": 0.8264520643806857,
         | 
| 158 | 
             
                        "bluex": 0.7482614742698191,
         | 
| @@ -166,5 +174,72 @@ | |
| 166 | 
             
                    },
         | 
| 167 | 
             
                    "result_metrics_average": 0.7914657682594597,
         | 
| 168 | 
             
                    "result_metrics_npm": 0.6834036936130392
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 169 | 
             
                }
         | 
| 170 | 
             
            ]
         | 
|  | |
| 6 | 
             
                    "date": "2024-04-12",
         | 
| 7 | 
             
                    "status": "full",
         | 
| 8 | 
             
                    "main_language": "Portuguese",
         | 
| 9 | 
            +
                    "model_type": "proprietary",
         | 
| 10 | 
             
                    "result_metrics": {
         | 
| 11 | 
             
                        "enem_challenge": 0.7172848145556333,
         | 
| 12 | 
             
                        "bluex": 0.5549374130737135,
         | 
|  | |
| 28 | 
             
                    "date": "2024-04-13",
         | 
| 29 | 
             
                    "status": "full",
         | 
| 30 | 
             
                    "main_language": "Portuguese",
         | 
| 31 | 
            +
                    "model_type": "proprietary",
         | 
| 32 | 
             
                    "result_metrics": {
         | 
| 33 | 
             
                        "enem_challenge": 0.8180545836249126,
         | 
| 34 | 
             
                        "bluex": 0.717663421418637,
         | 
|  | |
| 50 | 
             
                    "date": "2024-03-08",
         | 
| 51 | 
             
                    "status": "full",
         | 
| 52 | 
             
                    "main_language": "English",
         | 
| 53 | 
            +
                    "model_type": "proprietary",
         | 
| 54 | 
             
                    "result_metrics": {
         | 
| 55 | 
             
                        "enem_challenge": 0.7214835549335199,
         | 
| 56 | 
             
                        "bluex": 0.6244784422809457,
         | 
|  | |
| 72 | 
             
                    "date": "2024-04-13",
         | 
| 73 | 
             
                    "status": "full",
         | 
| 74 | 
             
                    "main_language": "English",
         | 
| 75 | 
            +
                    "model_type": "proprietary",
         | 
| 76 | 
             
                    "result_metrics": {
         | 
| 77 | 
             
                        "enem_challenge": 0.7718684394681595,
         | 
| 78 | 
             
                        "bluex": 0.6662030598052852,
         | 
|  | |
| 94 | 
             
                    "date": "2024-03-08",
         | 
| 95 | 
             
                    "status": "full",
         | 
| 96 | 
             
                    "main_language": "English",
         | 
| 97 | 
            +
                    "model_type": "proprietary",
         | 
| 98 | 
             
                    "result_metrics": {
         | 
| 99 | 
             
                        "enem_challenge": 0.7130860741777467,
         | 
| 100 | 
             
                        "bluex": 0.5869262865090403,
         | 
|  | |
| 116 | 
             
                    "date": "2024-04-15",
         | 
| 117 | 
             
                    "status": "full",
         | 
| 118 | 
             
                    "main_language": "English",
         | 
| 119 | 
            +
                    "model_type": "proprietary",
         | 
| 120 | 
             
                    "result_metrics": {
         | 
| 121 | 
             
                        "enem_challenge": 0.8509447165850245,
         | 
| 122 | 
             
                        "bluex": 0.7719054242002782,
         | 
|  | |
| 138 | 
             
                    "date": "2024-05-18",
         | 
| 139 | 
             
                    "status": "full",
         | 
| 140 | 
             
                    "main_language": "English",
         | 
| 141 | 
            +
                    "model_type": "proprietary",
         | 
| 142 | 
             
                    "result_metrics": {
         | 
| 143 | 
             
                        "enem_challenge": 0.7844646606018194,
         | 
| 144 | 
             
                        "bluex": 0.6954102920723226,
         | 
|  | |
| 160 | 
             
                    "date": "2024-05-18",
         | 
| 161 | 
             
                    "status": "full",
         | 
| 162 | 
             
                    "main_language": "English",
         | 
| 163 | 
            +
                    "model_type": "proprietary",
         | 
| 164 | 
             
                    "result_metrics": {
         | 
| 165 | 
             
                        "enem_challenge": 0.8264520643806857,
         | 
| 166 | 
             
                        "bluex": 0.7482614742698191,
         | 
|  | |
| 174 | 
             
                    },
         | 
| 175 | 
             
                    "result_metrics_average": 0.7914657682594597,
         | 
| 176 | 
             
                    "result_metrics_npm": 0.6834036936130392
         | 
| 177 | 
            +
                },
         | 
| 178 | 
            +
                {
         | 
| 179 | 
            +
                    "model": "gemini-1.5-flash",
         | 
| 180 | 
            +
                    "name": "Gemini 1.5 Flash",
         | 
| 181 | 
            +
                    "link": "https://cloud.google.com/vertex-ai",
         | 
| 182 | 
            +
                    "date": "2024-08-09",
         | 
| 183 | 
            +
                    "status": "full",
         | 
| 184 | 
            +
                    "main_language": "English",
         | 
| 185 | 
            +
                    "model_type": "proprietary",
         | 
| 186 | 
            +
                    "result_metrics": {
         | 
| 187 | 
            +
                        "enem_challenge": 0.8306508047585724,
         | 
| 188 | 
            +
                        "bluex": 0.7579972183588317,
         | 
| 189 | 
            +
                        "oab_exams": 0.6446469248291572,
         | 
| 190 | 
            +
                        "assin2_sts": 0.838806085610371,
         | 
| 191 | 
            +
                        "assin2_rte": 0.9366169973822607,
         | 
| 192 | 
            +
                        "faquad_nli": 0.7963910785668922,
         | 
| 193 | 
            +
                        "hatebr_offensive": 0.9092078461170015,
         | 
| 194 | 
            +
                        "portuguese_hate_speech": 0.6932563987219857,
         | 
| 195 | 
            +
                        "tweetsentbr": 0.7312948963367732
         | 
| 196 | 
            +
                    },
         | 
| 197 | 
            +
                    "result_metrics_average": 0.7932075834090939,
         | 
| 198 | 
            +
                    "result_metrics_npm": 0.6855338135928848
         | 
| 199 | 
            +
                },
         | 
| 200 | 
            +
                {
         | 
| 201 | 
            +
                    "model": "gpt-4o-mini-2024-07-18",
         | 
| 202 | 
            +
                    "name": "GPT 4o Mini (2024-07-18)",
         | 
| 203 | 
            +
                    "link": "https://www.openai.com/",
         | 
| 204 | 
            +
                    "date": "2024-07-25",
         | 
| 205 | 
            +
                    "status": "full",
         | 
| 206 | 
            +
                    "main_language": "English",
         | 
| 207 | 
            +
                    "model_type": "proprietary",
         | 
| 208 | 
            +
                    "result_metrics": {
         | 
| 209 | 
            +
                        "enem_challenge": 0.7669699090272918,
         | 
| 210 | 
            +
                        "bluex": 0.6842837273991655,
         | 
| 211 | 
            +
                        "oab_exams": 0.6013667425968109,
         | 
| 212 | 
            +
                        "assin2_sts": 0.7259038954527597,
         | 
| 213 | 
            +
                        "assin2_rte": 0.942809846745341,
         | 
| 214 | 
            +
                        "faquad_nli": 0.819807735300693,
         | 
| 215 | 
            +
                        "hatebr_offensive": 0.8682357029532165,
         | 
| 216 | 
            +
                        "portuguese_hate_speech": 0.7501413502853012,
         | 
| 217 | 
            +
                        "tweetsentbr": 0.7509303825869922
         | 
| 218 | 
            +
                    },
         | 
| 219 | 
            +
                    "result_metrics_average": 0.7678276991497301,
         | 
| 220 | 
            +
                    "result_metrics_npm": 0.6595966999910003
         | 
| 221 | 
            +
                },
         | 
| 222 | 
            +
                {
         | 
| 223 | 
            +
                    "model": "nemotron-4-340b-instruct",
         | 
| 224 | 
            +
                    "name": "nvidia/Nemotron-4-340B-Instruct (Nvidia API)",
         | 
| 225 | 
            +
                    "link": "https://build.nvidia.com/nvidia/nemotron-4-340b-instruct",
         | 
| 226 | 
            +
                    "date": "2024-06-30",
         | 
| 227 | 
            +
                    "status": "full",
         | 
| 228 | 
            +
                    "main_language": "English",
         | 
| 229 | 
            +
                    "model_type": "chat",
         | 
| 230 | 
            +
                    "params": 340.0,
         | 
| 231 | 
            +
                    "result_metrics": {
         | 
| 232 | 
            +
                        "enem_challenge": 0.6648005598320503,
         | 
| 233 | 
            +
                        "bluex": 0.6578581363004172,
         | 
| 234 | 
            +
                        "oab_exams": 0.7020501138952164,
         | 
| 235 | 
            +
                        "assin2_sts": 0.7857731021403329,
         | 
| 236 | 
            +
                        "assin2_rte": 0.9489354458928496,
         | 
| 237 | 
            +
                        "faquad_nli": 0.8194444444444444,
         | 
| 238 | 
            +
                        "hatebr_offensive": 0.8641580001234928,
         | 
| 239 | 
            +
                        "portuguese_hate_speech": 0.7761835184102864,
         | 
| 240 | 
            +
                        "tweetsentbr": 0.780880021326841
         | 
| 241 | 
            +
                    },
         | 
| 242 | 
            +
                    "result_metrics_average": 0.7777870380406591,
         | 
| 243 | 
            +
                    "result_metrics_npm": 0.6740728488043128
         | 
| 244 | 
             
                }
         | 
| 245 | 
             
            ]
         | 
    	
        src/display/utils.py
    CHANGED
    
    | @@ -166,24 +166,30 @@ human_baseline_row[AutoEvalColumn.npm.name] = round(sum(npm) / len(npm), 2) | |
| 166 | 
             
            if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
         | 
| 167 | 
             
                human_baseline_row["🤗 Leaderboard Average"] = None
         | 
| 168 |  | 
| 169 | 
            -
            # | 
| 170 | 
            -
             | 
| 171 | 
            -
            if os.path.exists(' | 
| 172 | 
            -
                with open(' | 
| 173 | 
             
                    all_models = json.load(f)
         | 
| 174 | 
             
                for model_data in all_models:
         | 
| 175 | 
             
                    model_row = deepcopy(baseline_row)
         | 
| 176 | 
             
                    model_row[AutoEvalColumn.model.name] = f'<a target="_blank" href="{model_data["link"]}" style="color: var(--text-color); text-decoration: underline;text-decoration-style: dotted;">{model_data["name"]} [{model_data["date"]}]</a>'
         | 
| 177 | 
             
                    model_row[AutoEvalColumn.dummy.name] = model_data['model']
         | 
| 178 | 
            -
                    model_row[AutoEvalColumn.license.name] = "Proprietary"
         | 
| 179 | 
             
                    for task in Tasks:
         | 
| 180 | 
             
                        model_row[task.value.col_name] = round(model_data['result_metrics'][task.value.benchmark]*100, 2)
         | 
| 181 | 
             
                    model_row[AutoEvalColumn.average.name] = round(model_data['result_metrics_average']*100, 2)
         | 
| 182 | 
             
                    model_row[AutoEvalColumn.npm.name] = round(model_data['result_metrics_npm']*100, 2)
         | 
| 183 | 
            -
                     | 
| 184 | 
            -
                     | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 185 | 
             
                    model_row[AutoEvalColumn.main_language.name] = model_data['main_language']
         | 
| 186 | 
            -
                     | 
| 187 |  | 
| 188 | 
             
            @dataclass
         | 
| 189 | 
             
            class ModelDetails:
         | 
|  | |
| 166 | 
             
            if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
         | 
| 167 | 
             
                human_baseline_row["🤗 Leaderboard Average"] = None
         | 
| 168 |  | 
| 169 | 
            +
            #External models
         | 
| 170 | 
            +
            external_rows = []
         | 
| 171 | 
            +
            if os.path.exists('external_models_results.json'):
         | 
| 172 | 
            +
                with open('external_models_results.json', 'r', encoding='utf8') as f:
         | 
| 173 | 
             
                    all_models = json.load(f)
         | 
| 174 | 
             
                for model_data in all_models:
         | 
| 175 | 
             
                    model_row = deepcopy(baseline_row)
         | 
| 176 | 
             
                    model_row[AutoEvalColumn.model.name] = f'<a target="_blank" href="{model_data["link"]}" style="color: var(--text-color); text-decoration: underline;text-decoration-style: dotted;">{model_data["name"]} [{model_data["date"]}]</a>'
         | 
| 177 | 
             
                    model_row[AutoEvalColumn.dummy.name] = model_data['model']
         | 
|  | |
| 178 | 
             
                    for task in Tasks:
         | 
| 179 | 
             
                        model_row[task.value.col_name] = round(model_data['result_metrics'][task.value.benchmark]*100, 2)
         | 
| 180 | 
             
                    model_row[AutoEvalColumn.average.name] = round(model_data['result_metrics_average']*100, 2)
         | 
| 181 | 
             
                    model_row[AutoEvalColumn.npm.name] = round(model_data['result_metrics_npm']*100, 2)
         | 
| 182 | 
            +
                    
         | 
| 183 | 
            +
                    model_type = ModelType.from_str(model_data['model_type'])
         | 
| 184 | 
            +
                    model_row[AutoEvalColumn.model_type.name] = model_type.name
         | 
| 185 | 
            +
                    model_row[AutoEvalColumn.model_type_symbol.name] = model_type.symbol
         | 
| 186 | 
            +
                    if model_type == ModelType.proprietary:
         | 
| 187 | 
            +
                        model_row[AutoEvalColumn.license.name] = "Proprietary"
         | 
| 188 | 
            +
                    if 'params' in model_data:
         | 
| 189 | 
            +
                        model_row[AutoEvalColumn.params.name] = model_data['params']
         | 
| 190 | 
            +
             | 
| 191 | 
             
                    model_row[AutoEvalColumn.main_language.name] = model_data['main_language']
         | 
| 192 | 
            +
                    external_rows.append(model_row)
         | 
| 193 |  | 
| 194 | 
             
            @dataclass
         | 
| 195 | 
             
            class ModelDetails:
         | 
    	
        src/populate.py
    CHANGED
    
    | @@ -5,7 +5,7 @@ import copy | |
| 5 | 
             
            import pandas as pd
         | 
| 6 |  | 
| 7 | 
             
            from src.display.formatting import has_no_nan_values, make_requests_clickable_model
         | 
| 8 | 
            -
            from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row,  | 
| 9 | 
             
            from src.leaderboard.filter_models import filter_models_flags
         | 
| 10 | 
             
            from src.leaderboard.read_evals import get_raw_eval_results
         | 
| 11 |  | 
| @@ -14,8 +14,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, | |
| 14 | 
             
                raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
         | 
| 15 | 
             
                all_data_json = [v.to_dict() for v in raw_data]
         | 
| 16 | 
             
                all_data_json.append(baseline_row)
         | 
| 17 | 
            -
                for  | 
| 18 | 
            -
                    all_data_json.append( | 
| 19 | 
             
                filter_models_flags(all_data_json)
         | 
| 20 |  | 
| 21 | 
             
                df = pd.DataFrame.from_records(all_data_json)
         | 
|  | |
| 5 | 
             
            import pandas as pd
         | 
| 6 |  | 
| 7 | 
             
            from src.display.formatting import has_no_nan_values, make_requests_clickable_model
         | 
| 8 | 
            +
            from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row, external_rows
         | 
| 9 | 
             
            from src.leaderboard.filter_models import filter_models_flags
         | 
| 10 | 
             
            from src.leaderboard.read_evals import get_raw_eval_results
         | 
| 11 |  | 
|  | |
| 14 | 
             
                raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
         | 
| 15 | 
             
                all_data_json = [v.to_dict() for v in raw_data]
         | 
| 16 | 
             
                all_data_json.append(baseline_row)
         | 
| 17 | 
            +
                for external_row in external_rows:
         | 
| 18 | 
            +
                    all_data_json.append(external_row)
         | 
| 19 | 
             
                filter_models_flags(all_data_json)
         | 
| 20 |  | 
| 21 | 
             
                df = pd.DataFrame.from_records(all_data_json)
         | 
 
			
