Added computation and display of the standard deviation across individual prompt accuracy values for each task
- app.py +7 -5
- preprocess_models_output.py +2 -0
- src/about.py +57 -47
- src/populate.py +1 -0
- src/tasks.py +3 -3
    	
app.py
CHANGED

```diff
@@ -143,6 +143,7 @@ download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
 # Load leaderboard data
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+print(LEADERBOARD_DF.columns.tolist())

 # Prepare the main interface
 demo = gr.Blocks(css=custom_css)
@@ -194,9 +195,9 @@ with demo:
                 gr.Markdown(task_description, elem_classes="markdown-text")

                 leaderboard = update_task_leaderboard(
-                    LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}),
-                    default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
-                    hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
+                    LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Prompt Std": "Prompt Std", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}),
+                    default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
+                    hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id']]
                 )

         # About tab
@@ -211,13 +212,14 @@ with demo:

                 leaderboard = update_task_leaderboard(
                     LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average",
+                                                   f"{task} Prompt Std": "Prompt Std",
                                                    f"{task} Best Prompt": "Best Prompt",
                                                    f"{task} Best Prompt Id": "Best Prompt Id",
                                                    task: "Combined Performance"}),
-                    default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
+                    default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt',
                                        'Best Prompt Id'],
                     hidden_columns=[col for col in LEADERBOARD_DF.columns if
-                                    col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average',
+                                    col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std',
                                                 'Best Prompt', 'Best Prompt Id']]
                 )

```
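For context, a minimal, self-contained sketch of what the changed call does: the per-task columns (now including `{task} Prompt Std`) are renamed to generic names before the frame is handed to `update_task_leaderboard`. The miniature `LEADERBOARD_DF` below is hypothetical; only the rename mapping and the column lists mirror the diff above.

```python
import pandas as pd

# Hypothetical miniature of LEADERBOARD_DF for a single task ("TE").
LEADERBOARD_DF = pd.DataFrame({
    "FS": ["0-shot", "5-shot"],
    "Model": ["model-a", "model-b"],
    "TE": [61.2, 58.9],                   # combined performance for the task
    "TE Prompt Average": [55.4, 52.1],
    "TE Prompt Std": [3.2, 4.7],          # new column surfaced by this commit
    "TE Best Prompt": [63.0, 60.5],
    "TE Best Prompt Id": [3, 5],
})

task = "TE"
shown = ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std',
         'Best Prompt', 'Best Prompt Id']

# Same rename mapping as in app.py: task-prefixed columns become generic names.
renamed = LEADERBOARD_DF.rename(columns={
    f"{task} Prompt Average": "Prompt Average",
    f"{task} Prompt Std": "Prompt Std",
    f"{task} Best Prompt": "Best Prompt",
    f"{task} Best Prompt Id": "Best Prompt Id",
    task: "Combined Performance",
})
# Same hidden-column rule as in the diff: anything not in the default selection.
hidden = [col for col in LEADERBOARD_DF.columns if col not in shown]

print(renamed[shown])   # columns selected by default in the task leaderboard
print(hidden)           # columns the leaderboard keeps hidden
```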
    	
preprocess_models_output.py
CHANGED

```diff
@@ -73,6 +73,7 @@ Evaluation Report (.json format):
 import json
 import os
 import re
+import statistics

 def safe_float(value):
     """Safely converts a value to float, returning None if the conversion fails."""
@@ -90,6 +91,7 @@ def calculate_task_metrics(task_info):
         return None

     task_info['average_accuracy'] = sum(accuracies) / len(accuracies)
+    task_info['std_accuracy'] = statistics.stdev(accuracies) if len(accuracies) > 1 else 0.0
     best_prompt_data = max(task_info['prompts'], key=lambda x: x['value'])
     task_info['best_prompt'] = best_prompt_data['value']
     task_info['prompt_id'] = best_prompt_data['prompt']
```
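A runnable sketch of how the new metric slots into `calculate_task_metrics`. The full function body is not shown in the diff, so the derivation of `accuracies` from `task_info['prompts']` below is an assumption based on the fields the diff does use; the `std_accuracy` line itself matches the change.

```python
import statistics

def calculate_task_metrics(task_info):
    # Assumed input shape, inferred from the fields used in the diff:
    # task_info['prompts'] is a list of {'prompt': <id>, 'value': <accuracy>} dicts.
    accuracies = [p['value'] for p in task_info['prompts'] if p['value'] is not None]
    if not accuracies:
        return None

    task_info['average_accuracy'] = sum(accuracies) / len(accuracies)
    # New in this commit: sample standard deviation across the per-prompt
    # accuracies, guarded because statistics.stdev needs at least two values.
    task_info['std_accuracy'] = statistics.stdev(accuracies) if len(accuracies) > 1 else 0.0

    best_prompt_data = max(task_info['prompts'], key=lambda x: x['value'])
    task_info['best_prompt'] = best_prompt_data['value']
    task_info['prompt_id'] = best_prompt_data['prompt']
    return task_info

example = {'prompts': [{'prompt': 1, 'value': 52.0},
                       {'prompt': 2, 'value': 58.0},
                       {'prompt': 3, 'value': 61.0}]}
print(calculate_task_metrics(example)['std_accuracy'])  # ~4.58
```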
    	
src/about.py
CHANGED

```diff
@@ -15,53 +15,63 @@ class Tasks(Enum):

     task1 = Task("text-entailment_1", "acc", "CPS", "TE")
     task2 = Task("text-entailment_2", "acc", "average_accuracy", "TE Prompt Average")
-    [previous Task definitions (old lines 18-64, without the std_accuracy entries); the removed lines are truncated in this diff view]
+    task3 = Task("text-entailment_3", "acc", "std_accuracy", "TE Prompt Std")
+    task4 = Task("text-entailment_4", "acc", "best_prompt", "TE Best Prompt")
+    task5 = Task("text-entailment_5", "acc", "prompt_id", "TE Best Prompt Id")
+
+    task6 = Task("sentiment-analysis_1", "acc", "CPS", "SA")
+    task7 = Task("sentiment-analysis_2", "acc", "average_accuracy", "SA Prompt Average")
+    task8 = Task("sentiment-analysis_3", "acc", "std_accuracy", "SA STD Accuracy")
+    task9 = Task("sentiment-analysis_4", "acc", "best_prompt", "SA Best Prompt")
+    task10 = Task("sentiment-analysis_5", "acc", "prompt_id", "SA Best Prompt Id")
+
+    task11 = Task("hate-speech-detection_1", "acc", "CPS", "HS")
+    task12 = Task("hate-speech-detection_2", "acc", "average_accuracy", "HS Prompt Average")
+    task13 = Task("hate-speech-detection_3", "acc", "std_accuracy", "HS Prompt Std")
+    task14 = Task("hate-speech-detection_4", "acc", "best_prompt", "HS Best Prompt")
+    task15 = Task("hate-speech-detection_5", "acc", "prompt_id", "HS Best Prompt Id")
+
+    task16 = Task("admission-test_1", "acc", "CPS", "AT")
+    task17 = Task("admission-test_2", "acc", "average_accuracy", "AT Prompt Average")
+    task18 = Task("admission-test_3", "acc", "std_accuracy", "AT Prompt Std")
+    task19 = Task("admission-test_4", "acc", "best_prompt", "AT Best Prompt")
+    task20 = Task("admission-test_5", "acc", "prompt_id", "AT Best Prompt Id")
+
+    task21 = Task("word-in-context_1", "acc", "CPS", "WIC")
+    task22 = Task("word-in-context_2", "acc", "average_accuracy", "WIC Prompt Average")
+    task23 = Task("word-in-context_3", "acc", "std_accuracy", "WIC Prompt Std")
+    task24 = Task("word-in-context_4", "acc", "best_prompt", "WIC Best Prompt")
+    task25 = Task("word-in-context_5", "acc", "prompt_id", "WIC Best Prompt Id")
+
+    task26 = Task("faq_1", "acc", "CPS", "FAQ")
+    task27 = Task("faq_2", "acc", "average_accuracy", "FAQ Prompt Average")
+    task28 = Task("faq_3", "acc", "std_accuracy", "FAQ Prompt Std")
+    task29 = Task("faq_4", "acc", "best_prompt", "FAQ Best Prompt")
+    task30 = Task("faq_5", "acc", "prompt_id", "FAQ Best Prompt Id")
+
+    task31 = Task("lexical-substitution_1", "acc", "CPS", "LS")
+    task32 = Task("lexical-substitution_2", "acc", "average_accuracy", "LS Prompt Average")
+    task33 = Task("lexical-substitution_3", "acc", "std_accuracy", "LS Prompt Std")
+    task34 = Task("lexical-substitution_4", "acc", "best_prompt", "LS Best Prompt")
+    task35 = Task("lexical-substitution_5", "acc", "prompt_id", "LS Best Prompt Id")
+
+    task36 = Task("summarization-fanpage_1", "acc", "CPS", "SU")
+    task37 = Task("summarization-fanpage_2", "acc", "average_accuracy", "SU Prompt Average")
+    task38 = Task("summarization-fanpage_3", "acc", "std_accuracy", "SU Prompt Std")
+    task39 = Task("summarization-fanpage_4", "acc", "best_prompt", "SU Best Prompt")
+    task40 = Task("summarization-fanpage_5", "acc", "prompt_id", "SU Best Prompt Id")
+
+    task41 = Task("evalita NER_1", "acc", "CPS", "NER")
+    task42 = Task("evalita NER_2", "acc", "average_accuracy", "NER Prompt Average")
+    task43 = Task("evalita NER_3", "acc", "std_accuracy", "NER Prompt Std")
+    task44 = Task("evalita NER_4", "acc", "best_prompt", "NER Best Prompt")
+    task45 = Task("evalita NER_5", "acc", "prompt_id", "NER Best Prompt Id")
+
+    task46 = Task("relation-extraction_1", "acc", "CPS", "REL")
+    task47 = Task("relation-extraction_2", "acc", "average_accuracy", "REL Prompt Average")
+    task48 = Task("relation-extraction_5", "acc", "std_accuracy", "REL Prompt Std")
+    task49 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt")
+    task50 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id")

     '''
     task0 = Task("TextualEntailment", "acc", "Textual Entailment")
```
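The `Task` constructor itself is not part of this diff, so its real field names are not visible here. The sketch below only illustrates the pattern the new entries follow, under an assumed `Task(benchmark, metric, key, col_name)` shape: each task now contributes five leaderboard columns, and the third argument names the field of the evaluation report (`std_accuracy` for the new one) that feeds the displayed column.

```python
from dataclasses import dataclass
from enum import Enum

# Assumed shape of Task; the actual definition lives elsewhere in src/about.py.
@dataclass(frozen=True)
class Task:
    benchmark: str   # e.g. "text-entailment_3"
    metric: str      # e.g. "acc"
    key: str         # field read from the evaluation report, e.g. "std_accuracy"
    col_name: str    # column shown on the leaderboard, e.g. "TE Prompt Std"

class Tasks(Enum):
    # Two of the new std entries, copied from the diff above.
    te_std = Task("text-entailment_3", "acc", "std_accuracy", "TE Prompt Std")
    sa_std = Task("sentiment-analysis_3", "acc", "std_accuracy", "SA STD Accuracy")

# Each task group now maps five report fields (CPS, average, std, best prompt,
# best prompt id) to five leaderboard columns; the std entries are the new ones.
for t in Tasks:
    print(t.value.key, "->", t.value.col_name)
```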
    	
src/populate.py
CHANGED

```diff
@@ -16,6 +16,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
+    #df.to_csv("output.csv", index=False)

     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
```
    	
src/tasks.py
CHANGED

```diff
@@ -63,8 +63,8 @@ HS_DESCRIPTION = """### Hate Speech (HS) --- *Multiple-choice task*
 |-----|--------------------------------------------------------------------------------|-------------------------------------------------|
 | 1   | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?                     | ["Falso", "Vero"] |
 | 2   | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'? | ["Falso", "Vero"] |
-| 3   | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: \\nB: Falso\\nRisposta: | ["B", "A"]                                      |
-| 4   | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: \\nB: \\nRisposta: | ["B", "A"]                                      |
+| 3   | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: Vero\\nB: Falso\\nRisposta: | ["B", "A"]                                      |
+| 4   | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: Vero\\nB: Falso\\nRisposta: | ["B", "A"]                                      |
 | 5   | Il tweet: '{{full_text}}'                                                      | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] |
 | 6   | Devi svolgere un compito di identificazione di incitamento all'odio. Il tweet: '{{full_text}}' | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] |

@@ -81,7 +81,7 @@ AT_DESCRIPTION = """### Admission Tests (AT) --- *Multiple-choice task*
 | 2   | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"]   |
 | 3   | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"]   |
 | 4   | Devi risolvere un compito a scelta multipla. Dato il seguente caso clinico: '{{background}}', qual è la risposta corretta alla domanda: '{{domanda}}'?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta:Devi risolvere un compito a scelta multipla. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"]   |
-| 5   | Dato il seguente  |
+| 5   | Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"]   |
 | 6   | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"]   |

 <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the 6 prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
```
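A small numeric check of the Combined Performance formula quoted in the context line above, using made-up accuracy values; it shows how a larger gap between the best prompt and the prompt average pulls the combined score down.

```python
def combined_performance(best_prompt: float, prompt_average: float) -> float:
    """CPS = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt, accuracies in %."""
    return (1 - (best_prompt - prompt_average) / 100) * best_prompt

# Hypothetical accuracies (%): same best prompt, different prompt averages.
print(combined_performance(63.0, 55.4))  # 58.212 -- small gap, mild penalty
print(combined_performance(63.0, 40.0))  # 48.51  -- large gap, bigger penalty
```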