Spaces:
Runtime error
Runtime error
update description
Browse files - src/about.py +7 -7
src/about.py
CHANGED
|
@@ -33,10 +33,10 @@ class Tasks(Enum):
|
|
| 33 |
task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149)
|
| 34 |
task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343)
|
| 35 |
task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until", 0.343)
|
| 36 |
-
task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
|
| 37 |
task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677) # multiple_choice
|
| 38 |
task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0) # generate_until
|
| 39 |
task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0) # generate_until
|
|
|
|
| 40 |
|
| 41 |
NUM_FEWSHOT = 0 # Change with your few shot
|
| 42 |
# ---------------------------------------------------
|
|
@@ -58,9 +58,11 @@ TITLE = """
|
|
| 58 |
INTRODUCTION_TEXT = """
|
| 59 |
The leaderboard evaluates language models on a set of Polish tasks. The tasks are designed to test the models' ability to understand and generate Polish text. The leaderboard is designed to be a benchmark for the Polish language model community, and to help researchers and practitioners understand the capabilities of different models.
|
| 60 |
|
| 61 |
-
Almost every task has two versions: regex and multiple choice.
|
| 62 |
* _g suffix means that a model needs to generate an answer (only suitable for instructions-based models)
|
| 63 |
* _mc suffix means that a model is scored against every possible class (suitable also for base models)
|
|
|
|
|
|
|
| 64 |
"""
|
| 65 |
|
| 66 |
# Which evaluations are you running? how can people reproduce what you have?
|
|
@@ -75,11 +77,9 @@ or join our [Discord SpeakLeash](https://discord.gg/3G9DVM39)
|
|
| 75 |
|
| 76 |
* fix long model names
|
| 77 |
* add inference time
|
| 78 |
-
* add metadata for models (e.g. #Params)
|
| 79 |
* add more tasks
|
| 80 |
* use model templates
|
| 81 |
* fix scrolling on Firefox
|
| 82 |
-
* polish_poleval2018_task3_test_10k - IN PROGRESS
|
| 83 |
|
| 84 |
## Tasks
|
| 85 |
|
|
@@ -103,10 +103,10 @@ or join our [Discord SpeakLeash](https://discord.gg/3G9DVM39)
|
|
| 103 |
| cbd_g | ptaszynski/PolishCyberbullyingDataset | macro F1 | generate_until |
|
| 104 |
| klej_ner_mc | allegro/klej-nkjp-ner | accuracy | multiple_choice |
|
| 105 |
| klej_ner_g | allegro/klej-nkjp-ner | accuracy | generate_until |
|
|
|
|
|
|
|
|
|
|
| 106 |
| poleval2018_task3_test_10k | enelpol/poleval2018_task3_test_10k | word perplexity | other |
|
| 107 |
-
| polqa_reranking_mc | ipipan/polqa | accuracy | other |
|
| 108 |
-
| polqa_open_book_g | ipipan/polqa | levenshtein | other |
|
| 109 |
-
| polqa_closed_book_g | ipipan/polqa | levenshtein | other |
|
| 110 |
|
| 111 |
## Reproducibility
|
| 112 |
To reproduce our results, you need to clone the repository:
|
|
|
|
| 33 |
task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149)
|
| 34 |
task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343)
|
| 35 |
task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until", 0.343)
|
|
|
|
| 36 |
task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677) # multiple_choice
|
| 37 |
task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0) # generate_until
|
| 38 |
task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0) # generate_until
|
| 39 |
+
task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
|
| 40 |
|
| 41 |
NUM_FEWSHOT = 0 # Change with your few shot
|
| 42 |
# ---------------------------------------------------
|
|
|
|
| 58 |
INTRODUCTION_TEXT = """
|
| 59 |
The leaderboard evaluates language models on a set of Polish tasks. The tasks are designed to test the models' ability to understand and generate Polish text. The leaderboard is designed to be a benchmark for the Polish language model community, and to help researchers and practitioners understand the capabilities of different models.
|
| 60 |
|
| 61 |
+
Almost every task has two versions: regex and multiple choice.
|
| 62 |
* _g suffix means that a model needs to generate an answer (only suitable for instructions-based models)
|
| 63 |
* _mc suffix means that a model is scored against every possible class (suitable also for base models)
|
| 64 |
+
|
| 65 |
+
Average columns are normalized against scores by "Baseline (majority class)".
|
| 66 |
"""
|
| 67 |
|
| 68 |
# Which evaluations are you running? how can people reproduce what you have?
|
|
|
|
| 77 |
|
| 78 |
* fix long model names
|
| 79 |
* add inference time
|
|
|
|
| 80 |
* add more tasks
|
| 81 |
* use model templates
|
| 82 |
* fix scrolling on Firefox
|
|
|
|
| 83 |
|
| 84 |
## Tasks
|
| 85 |
|
|
|
|
| 103 |
| cbd_g | ptaszynski/PolishCyberbullyingDataset | macro F1 | generate_until |
|
| 104 |
| klej_ner_mc | allegro/klej-nkjp-ner | accuracy | multiple_choice |
|
| 105 |
| klej_ner_g | allegro/klej-nkjp-ner | accuracy | generate_until |
|
| 106 |
+
| polqa_reranking_mc | ipipan/polqa | accuracy | multiple_choice |
|
| 107 |
+
| polqa_open_book_g | ipipan/polqa | levenshtein | generate_until |
|
| 108 |
+
| polqa_closed_book_g | ipipan/polqa | levenshtein | generate_until |
|
| 109 |
| poleval2018_task3_test_10k | enelpol/poleval2018_task3_test_10k | word perplexity | other |
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
## Reproducibility
|
| 112 |
To reproduce our results, you need to clone the repository:
|