Small Changes

Files changed:
- src/about.py: +2 -2
- src/tasks.py: +10 -10
src/about.py

@@ -93,8 +93,8 @@ TITLE = """<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀
 INTRODUCTION_TEXT = """
 Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing features of Evalita-LLM are the following: (i) **all tasks are native Italian**, avoiding translation issues and potential cultural biases; (ii) the benchmark includes **generative** tasks, enabling more natural interaction with LLMs; (iii) **all tasks are evaluated against multiple prompts**, thereby mitigating the model's sensitivity to specific prompts and allowing a fairer evaluation.
 
-**<small>Multiple Choice:</small>** <small> 📊TE (Textual Entailment), 😃SA (Sentiment Analysis), ⚠️HS (Hate Speech Detection), 🏥AT (Admission Test), 🔤WIC (Word in Context), ❓FAQ (Frequently Asked Questions) </small><br>
-**<small>Generative:</small>** <small>🔄LS (Lexical Substitution), 📝SU (Summarization), 🏷️NER (Named Entity Recognition), 🔗REL (Relation Extraction) </small>
+**<small>Multiple-choice tasks:</small>** <small> 📊TE (Textual Entailment), 😃SA (Sentiment Analysis), ⚠️HS (Hate Speech Detection), 🏥AT (Admission Test), 🔤WIC (Word in Context), ❓FAQ (Frequently Asked Questions) </small><br>
+**<small>Generative tasks:</small>** <small>🔄LS (Lexical Substitution), 📝SU (Summarization), 🏷️NER (Named Entity Recognition), 🔗REL (Relation Extraction) </small>
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
src/tasks.py

@@ -23,7 +23,7 @@ Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on
 MEASURE_DESCRIPTION = "**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the assessed prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above)."
 
 # Tasks Descriptions
-TE_DESCRIPTION = """### Textual Entailment (TE) *(Multiple Choice)*
+TE_DESCRIPTION = """### Textual Entailment (TE) --- *Multiple-choice task*
 The input consists of two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text.
 
 | # | Prompt | Answer Choices |
@@ -39,7 +39,7 @@ TE_DESCRIPTION = """### Textual Entailment (TE) *(Multiple Choice)*
 
 """
 
-SA_DESCRIPTION = """### Sentiment Analysis (SA) *(Multiple Choice)*
+SA_DESCRIPTION = """### Sentiment Analysis (SA) --- *Multiple-choice task*
 The input is a tweet. The model has to determine the sentiment polarity of the text, categorizing it into one of four classes: positive, negative, neutral, or mixed.
 
 | # | Prompt | Answer Choices |

@@ -55,7 +55,7 @@ SA_DESCRIPTION = """### Sentiment Analysis (SA) *(Multiple Choice)*
 
 """
 
-HS_DESCRIPTION = """### Hate Speech (HS) *(Multiple Choice)*
+HS_DESCRIPTION = """### Hate Speech (HS) --- *Multiple-choice task*
 The input is a tweet. The model has to determine whether the text contains hateful content directed towards marginalized or minority groups. The output is a binary classification: hateful or not hateful.
 
 | # | Prompt | Answer Choices |

@@ -71,7 +71,7 @@ HS_DESCRIPTION = """### Hate Speech (HS) *(Multiple Choice)*
 
 """
 
-AT_DESCRIPTION = """### Admission Tests (AT) *(Multiple Choice)*
+AT_DESCRIPTION = """### Admission Tests (AT) --- *Multiple-choice task*
 The input is a multiple-choice question with five options (A-E) from Italian medical specialty entrance exams, and the model must identify the correct answer.
 
 | # | Prompt | Answer Choices |

@@ -87,7 +87,7 @@ AT_DESCRIPTION = """### Admission Tests (AT) *(Multiple Choice)*
 
 """
 
-WIC_DESCRIPTION = """### Word in Context (WIC) *(Multiple Choice)*
+WIC_DESCRIPTION = """### Word in Context (WIC) --- *Multiple-choice task*
 The input consists of a word (w) and two sentences. The model has to determine whether the word w has the same meaning in both sentences. The output is a binary classification: 1 (same meaning) or 0 (different meaning).
 
 | # | Prompt | Answer Choices |

@@ -103,7 +103,7 @@ WIC_DESCRIPTION = """### Word in Context (WIC) *(Multiple Choice)*
 
 """
 
-FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ) *(Multiple Choice)*
+FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ) --- *Multiple-choice task*
 The input is a user query regarding the water supply service. The model must identify the correct answer from the 4 available options.
 
 | # | Prompt | Answer Choices |

@@ -119,7 +119,7 @@ FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ) *(Multiple Choice)*
 
 """
 
-LS_DESCRIPTION = """### Lexical Substitution (LS) *(Generative)*
+LS_DESCRIPTION = """### Lexical Substitution (LS) --- *Generative task*
 The input is a sentence containing a target word (w). The model has to replace the target word w with the most suitable synonyms that are contextually relevant.
 
 | # | Prompt |

@@ -131,7 +131,7 @@ LS_DESCRIPTION = """### Lexical Substitution (LS) *(Generative)*
 
 """
 
-SU_DESCRIPTION = """### Summarization (SUM) *(Generative)*
+SU_DESCRIPTION = """### Summarization (SUM) --- *Generative task*
 The input is a news article. The model has to generate a concise summary of the input text, capturing the key information and main points.
 
 | # | Prompt |

@@ -143,7 +143,7 @@ SU_DESCRIPTION = """### Summarization (SUM) *(Generative)*
 
 """
 
-NER_DESCRIPTION = """### Named Entity Recognition (NER) *(Generative)*
+NER_DESCRIPTION = """### Named Entity Recognition (NER) --- *Generative task*
 The input is a sentence. The model has to identify and classify Named Entities into predefined categories such as person, organization, and location.
 
 | # | Prompt |

@@ -155,7 +155,7 @@ NER_DESCRIPTION = """### Named Entity Recognition (NER) *(Generative)*
 
 """
 
-REL_DESCRIPTION = """### Relation Extraction (REL) *(Generative)*
+REL_DESCRIPTION = """### Relation Extraction (REL) --- *Generative task*
 The input is a sentence from a clinical text. The model must identify and extract relationships between laboratory test results (e.g., blood pressure) and the corresponding tests or procedures that generated them (e.g., blood pressure test).
 
 | # | Prompt |
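
The Combined Performance formula quoted in MEASURE_DESCRIPTION above is easy to sanity-check. Below is a minimal sketch, assuming per-prompt accuracies on a 0-100 scale; the function name and the sample numbers are illustrative and not taken from the repository.

```python
# Minimal sketch of the Combined Performance measure from MEASURE_DESCRIPTION.
# Assumes per-prompt accuracies on a 0-100 scale; the function name and the
# sample numbers are illustrative, not taken from the repository.

def combined_performance(prompt_accuracies: list[float]) -> float:
    """(1 - (Best Prompt - Prompt Average) / 100) * Best Prompt."""
    prompt_average = sum(prompt_accuracies) / len(prompt_accuracies)
    best_prompt = max(prompt_accuracies)
    return (1 - (best_prompt - prompt_average) / 100) * best_prompt

# An unstable model: best prompt 70, prompt average 50 -> heavily discounted.
print(round(combined_performance([50, 45, 70, 40, 55, 40]), 2))  # 56.0
# A consistent model: best prompt 66, prompt average 65 -> barely discounted.
print(round(combined_performance([65, 64, 66, 65, 65, 65]), 2))  # 65.34
```

The gap between Best Prompt and Prompt Average acts as a consistency penalty, so the second, more stable model comes out ahead despite its lower best prompt. This is how the measure implements point (iii) of the introduction: evaluating against multiple prompts to mitigate prompt sensitivity.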