open_benchmark_index / tasks_index.json
{
"tasks": [
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/acva.py",
"module": "src/lighteval/tasks/multilingual/tasks/acva.py",
"abstract": "Acva multilingual benchmark.",
"languages": [
"arabic"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": null,
"dataset": "OALL/ACVA",
"name": "Acva"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/afri_mgsm.py",
"module": "src/lighteval/tasks/multilingual/tasks/afri_mgsm.py",
"abstract": "African MGSM: MGSM for African Languages",
"languages": [
"amharic",
"ewe",
"french",
"hausa",
"igbo",
"kinyarwanda",
"lingala",
"luganda",
"oromo",
"shona",
"sotho",
"swahili",
"twi",
"wolof",
"xhosa",
"yoruba",
"zulu"
],
"tags": [
"math",
"multilingual",
"reasoning"
],
"paper": "https://arxiv.org/abs/2406.03368.",
"dataset": "masakhane/afrimgsm",
"name": "Afri Mgsm"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/afri_mmlu.py",
"module": "src/lighteval/tasks/multilingual/tasks/afri_mmlu.py",
"abstract": "African MMLU: African Massive Multitask Language Understanding",
"languages": [
"amharic",
"ewe",
"french",
"hausa",
"igbo",
"kinyarwanda",
"lingala",
"luganda",
"oromo",
"shona",
"sotho",
"swahili",
"twi",
"wolof",
"xhosa",
"yoruba",
"zulu"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": "https://arxiv.org/abs/2406.03368.",
"dataset": "masakhane/afrimmlu",
"name": "Afri Mmlu"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/afri_xnli.py",
"module": "src/lighteval/tasks/multilingual/tasks/afri_xnli.py",
"abstract": "African XNLI: African XNLI",
"languages": [
"amharic",
"ewe",
"french",
"hausa",
"igbo",
"kinyarwanda",
"lingala",
"luganda",
"oromo",
"shona",
"sotho",
"swahili",
"twi",
"wolof",
"xhosa",
"yoruba",
"zulu"
],
"tags": [
"classification",
"multilingual",
"nli"
],
"paper": "https://arxiv.org/abs/2406.03368.",
"dataset": "masakhane/afrixnli",
"name": "Afri Xnli"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/arabic.py",
"module": "src/lighteval/tasks/multilingual/tasks/arabic.py",
"abstract": "Collection of benchmarks for Arabic language.",
"languages": [
"arabic"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": null,
"dataset": "MBZUAI/ArabicMMLU, MBZUAI/human_translated_arabic_mmlu, OALL/Arabic_MMLU, OALL/ACVA, asas-ai/AraTrust-categorized",
"name": "Arabic Evals"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/arabic_arc.py",
"module": "src/lighteval/tasks/multilingual/tasks/arabic_arc.py",
"abstract": "Arabic Arc multilingual benchmark.",
"languages": [
"arabic"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": null,
"dataset": "OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
"name": "Arabic Arc"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/arabic_mmlu.py",
"module": "src/lighteval/tasks/multilingual/tasks/arabic_mmlu.py",
"abstract": "Arabic Mmlu multilingual benchmark.",
"languages": [
"arabic"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": null,
"dataset": "MBZUAI/ArabicMMLU",
"name": "Arabic Mmlu"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/arcd.py",
"module": "src/lighteval/tasks/multilingual/tasks/arcd.py",
"abstract": "ARCD: Arabic Reading Comprehension Dataset.",
"languages": [
"arabic"
],
"tags": [
"multilingual",
"multiple-choice",
"qa",
"reasoning"
],
"paper": "https://arxiv.org/pdf/1906.05394",
"dataset": "hsseinmz/arcd",
"name": "Arcd"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/belebele.py",
"module": "src/lighteval/tasks/multilingual/tasks/belebele.py",
"abstract": "Belebele: A large-scale reading comprehension dataset covering 122 languages.",
"languages": [
"arabic",
"armenian",
"bengali",
"cyrillic",
"devanagari",
"ethiopic",
"georgian",
"greek",
"gujarati",
"gurmukhi",
"chinese (simplified)",
"chinese (traditional)",
"hangul",
"hebrew",
"japanese",
"khmer",
"kannada",
"lao",
"latin",
"malayalam",
"myanmar",
"odia",
"sinhala",
"tamil",
"telugu",
"thai",
"tibetan"
],
"tags": [
"multilingual",
"multiple-choice",
"reading-comprehension"
],
"paper": "https://arxiv.org/abs/2308.16884",
"dataset": "facebook/belebele",
"name": "Belebele"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/c3.py",
"module": "src/lighteval/tasks/multilingual/tasks/c3.py",
"abstract": "C3: A Chinese Challenge Corpus for Cross-lingual and Cross-modal Tasks Reading\ncomprehension task part of clue.",
"languages": [
"chinese"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": "https://arxiv.org/abs/2004.05986",
"dataset": "clue/clue",
"name": "C3"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/ceval.py",
"module": "src/lighteval/tasks/multilingual/tasks/ceval.py",
"abstract": "Ceval multilingual benchmark.",
"languages": [
"chinese"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": null,
"dataset": "ceval/ceval-exam",
"name": "Ceval"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/chegeka.py",
"module": "src/lighteval/tasks/multilingual/tasks/chegeka.py",
"abstract": "Chegeka multilingual benchmark.",
"languages": [
"russian"
],
"tags": [
"knowledge",
"multilingual",
"qa"
],
"paper": null,
"dataset": "ai-forever/MERA",
"name": "Chegeka"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/chinese_squad.py",
"module": "src/lighteval/tasks/multilingual/tasks/chinese_squad.py",
"abstract": "ChineseSquad is a reading comprehension dataset for Chinese.",
"languages": [
"chinese"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://github.com/pluto-junzeng/ChineseSquad",
"dataset": "lighteval/ChineseSquad",
"name": "Chinese Squad"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/cmath.py",
"module": "src/lighteval/tasks/multilingual/tasks/cmath.py",
"abstract": "Cmath multilingual benchmark.",
"languages": [
"chinese"
],
"tags": [
"math",
"multilingual",
"reasoning"
],
"paper": null,
"dataset": "weitianwen/cmath",
"name": "Cmath"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/cmmlu.py",
"module": "src/lighteval/tasks/multilingual/tasks/cmmlu.py",
"abstract": "Cmmlu multilingual benchmark.",
"languages": [
"chinese"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": null,
"dataset": "haonan-li/cmmlu",
"name": "Cmmlu"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/cmnli.py",
"module": "src/lighteval/tasks/multilingual/tasks/cmnli.py",
"abstract": "Native Chinese NLI dataset based on MNLI approach (Machine Translated)",
"languages": [
"chinese"
],
"tags": [
"classification",
"multilingual",
"nli"
],
"paper": "https://arxiv.org/abs/2004.05986",
"dataset": "fenffef/cmnli",
"name": "Cmnli"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/cmrc2018.py",
"module": "src/lighteval/tasks/multilingual/tasks/cmrc2018.py",
"abstract": "CMRC 2018: A span-extraction machine reading comprehension dataset for Chinese.",
"languages": [
"chinese"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://arxiv.org/abs/1810.07366",
"dataset": "clue/clue",
"name": "Cmrc2018"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/copa_indic.py",
"module": "src/lighteval/tasks/multilingual/tasks/copa_indic.py",
"abstract": "IndicCOPA: COPA for Indic Languages Paper: https://arxiv.org/pdf/2212.05409\nIndicCOPA extends COPA to 15 Indic languages, providing a valuable resource for\nevaluating common sense reasoning in these languages.",
"languages": [
"assamese",
"bengali",
"gujarati",
"hindi",
"kannada",
"malayalam",
"marathi",
"nepali",
"oriya",
"punjabi",
"sanskrit",
"sindhi",
"tamil",
"telugu",
"urdu"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": "https://arxiv.org/pdf/2212.05409",
"dataset": "ai4bharat/IndicCOPA",
"name": "Copa Indic"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/enem.py",
"module": "src/lighteval/tasks/multilingual/tasks/enem.py",
"abstract": "ENEM (Exame Nacional do Ensino Médio) is a standardized Brazilian national\nsecondary education examination. The exam is used both as a university admission\ntest and as a high school evaluation test.",
"languages": [
"portuguese"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": "https://huggingface.co/datasets/maritaca-ai/enem",
"dataset": "maritaca-ai/enem",
"name": "Enem"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/exams.py",
"module": "src/lighteval/tasks/multilingual/tasks/exams.py",
"abstract": "Exams multilingual benchmark.",
"languages": [
"albanian",
"arabic",
"bulgarian",
"croatian",
"french",
"german",
"hungarian",
"italian",
"lithuanian",
"macedonian",
"polish",
"portuguese",
"serbian",
"spanish",
"turkish",
"vietnamese"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": null,
"dataset": "mhardalov/exams",
"name": "Exams"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/faquad.py",
"module": "src/lighteval/tasks/multilingual/tasks/faquad.py",
"abstract": "FaQuAD: A Portuguese Reading Comprehension Dataset",
"languages": [
"portuguese"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://arxiv.org/abs/2007.15671",
"dataset": "eraldoluis/faquad",
"name": "Faquad"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/filipino.py",
"module": "src/lighteval/tasks/multilingual/tasks/filipino.py",
"abstract": "Collection of benchmarks for Filipino language.",
"languages": [
"filipino"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": "https://github.com/filbench/filbench-eval/blob/main/filbench.pdf\nContact:\n- Lester James V. Miranda <ljvmiranda@gmail.com>\n- Elyanah Aco <elyanah.aco02@gmail.com>\n- Conner Manuel <manuel.conner.g@berkeley.edu>\n- Jan Christian Blaise Cruz <jcbcruz02@gmail.com>\n- Joseph Imperial <jmri20@bath.ac.uk>",
"dataset": "filbench/filbench-eval",
"name": "Filipino Evals"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/flores200.py",
"module": "src/lighteval/tasks/multilingual/tasks/flores200.py",
"abstract": "Flores200 multilingual benchmark.",
"languages": [
"arabic",
"armenian",
"bengali",
"cyrillic",
"devanagari",
"ethiopic",
"georgian",
"greek",
"gujarati",
"gurmukhi",
"chinese (simplified)",
"chinese (traditional)",
"hangul",
"hebrew",
"japanese",
"khmer",
"kannada",
"lao",
"latin",
"malayalam",
"myanmar",
"odia",
"sinhala",
"tamil",
"telugu",
"thai",
"tibetan"
],
"tags": [
"multilingual",
"translation"
],
"paper": null,
"dataset": "facebook/flores",
"name": "Flores200"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/fquad_v2.py",
"module": "src/lighteval/tasks/multilingual/tasks/fquad_v2.py",
"abstract": "FQuAD v2: French Question Answering Dataset version 2.",
"languages": [
"french"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://arxiv.org/abs/2002.06071",
"dataset": "manu/fquad2_test",
"name": "Fquad V2"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/french.py",
"module": "src/lighteval/tasks/multilingual/tasks/french.py",
"abstract": "Collection of benchmarks for the french language.",
"languages": [
"french"
],
"tags": [
"knowledge",
"multiple-choice",
"qa"
],
"paper": "https://huggingface.co/fr-gouv-coordination-ia",
"dataset": "fr-gouv-coordination-ia/IFEval-fr, fr-gouv-coordination-ia/gpqa-fr, fr-gouv-coordination-ia/bac-fr",
"name": "French Evals"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/french_boolq.py",
"module": "src/lighteval/tasks/multilingual/tasks/french_boolq.py",
"abstract": "French Boolq multilingual benchmark.",
"languages": [
"french"
],
"tags": [
"classification",
"multilingual",
"qa"
],
"paper": null,
"dataset": "manu/french_boolq",
"name": "French Boolq"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/french_triviqa.py",
"module": "src/lighteval/tasks/multilingual/tasks/french_triviqa.py",
"abstract": "French Triviqa multilingual benchmark.",
"languages": [
"french"
],
"tags": [
"multilingual",
"qa"
],
"paper": null,
"dataset": "manu/french-trivia",
"name": "French Triviqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/german_rag.py",
"module": "src/lighteval/tasks/multilingual/tasks/german_rag.py",
"abstract": "Collection of benchmarks for the German language.",
"languages": [
"german"
],
"tags": [
"knowledge",
"reasoning",
"multiple-choice"
],
"paper": "https://huggingface.co/datasets/deutsche-telekom/Ger-RAG-eval",
"dataset": "deutsche-telekom/Ger-RAG-eval",
"name": "German RAG Evals"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/germanquad.py",
"module": "src/lighteval/tasks/multilingual/tasks/germanquad.py",
"abstract": "GermanQuAD: High-quality German QA dataset with 13,722 questions.",
"languages": [
"german"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://arxiv.org/abs/2104.12741",
"dataset": "deepset/germanquad",
"name": "Germanquad"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/global_mmlu.py",
"module": "src/lighteval/tasks/multilingual/tasks/global_mmlu.py",
"abstract": "Translated MMLU using both professional and non-professional translators.\nContains tags for cultural sensitivity.",
"languages": [
"amharic",
"arabic",
"bengali",
"chinese",
"czech",
"dutch",
"english",
"french",
"german",
"hebrew",
"hindi",
"indonesian",
"italian",
"japanese",
"korean",
"malay",
"norwegian",
"polish",
"portuguese",
"romanian",
"russian",
"serbian",
"spanish",
"swahili",
"swedish",
"tamil",
"telugu",
"thai",
"turkish",
"ukrainian",
"urdu",
"vietnamese",
"yoruba",
"zulu"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": "https://huggingface.co/papers/2412.03304",
"dataset": "CohereForAI/Global-MMLU",
"name": "Global Mmlu"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/hellaswag_hin.py",
"module": "src/lighteval/tasks/multilingual/tasks/hellaswag_hin.py",
"abstract": "Hellaswag Hin multilingual benchmark.",
"languages": [
"hindi"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": null,
"dataset": "ai4bharat/hellaswag-hi",
"name": "Hellaswag Hin"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/hellaswag_tel.py",
"module": "src/lighteval/tasks/multilingual/tasks/hellaswag_tel.py",
"abstract": "Hellaswag Tel multilingual benchmark.",
"languages": [
"telugu"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": null,
"dataset": "LightFury9/hellaswag-telugu",
"name": "Hellaswag Tel"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/hellaswag_tha.py",
"module": "src/lighteval/tasks/multilingual/tasks/hellaswag_tha.py",
"abstract": "Hellaswag Thai This is a Thai adaptation of the Hellaswag task. Similar to the\nTurkish version, there's no specific paper, but it has been found to be\neffective for evaluating Thai language models on commonsense reasoning tasks.",
"languages": [
"thai"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": null,
"dataset": "lighteval/hellaswag_thai",
"name": "Hellaswag Tha"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/hellaswag_tur.py",
"module": "src/lighteval/tasks/multilingual/tasks/hellaswag_tur.py",
"abstract": "Hellaswag Turkish This is a Turkish adaptation of the Hellaswag task. While\nthere's no specific paper for this version, it has been found to work well for\nevaluating Turkish language models on commonsense reasoning tasks. We don't\nhandle them in single task as there is quite a lot of differences\n(dataset/subset, dot replacement, etc.) which would make it hard to read",
"languages": [
"turkish"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": null,
"dataset": "malhajar/hellaswag_tr-v0.2",
"name": "Hellaswag Tur"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/hindi_arc.py",
"module": "src/lighteval/tasks/multilingual/tasks/hindi_arc.py",
"abstract": "Hindi Arc multilingual benchmark.",
"languages": [
"hindi"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": null,
"dataset": "ai4bharat/ai2_arc-hi",
"name": "Hindi Arc"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/hindi_boolq.py",
"module": "src/lighteval/tasks/multilingual/tasks/hindi_boolq.py",
"abstract": "Hindi Boolq multilingual benchmark.",
"languages": [
"gujarati",
"hindi",
"malayalam",
"marathi",
"tamil"
],
"tags": [
"classification",
"multilingual",
"qa"
],
"paper": null,
"dataset": "ai4bharat/boolq-hi",
"name": "Hindi Boolq"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/indicqa.py",
"module": "src/lighteval/tasks/multilingual/tasks/indicqa.py",
"abstract": "IndicQA: A reading comprehension dataset for 11 Indian languages.",
"languages": [
"assamese",
"bengali",
"gujarati",
"hindi",
"kannada",
"malayalam",
"marathi",
"oriya",
"punjabi",
"tamil",
"telugu"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://arxiv.org/abs/2407.13522",
"dataset": "ai4bharat/IndicQA",
"name": "Indicqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/kenswquad.py",
"module": "src/lighteval/tasks/multilingual/tasks/kenswquad.py",
"abstract": "KenSwQuAD: A question answering dataset for Kenyan Swahili.",
"languages": [
"swahili"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://arxiv.org/abs/2205.02364",
"dataset": "lighteval/KenSwQuAD",
"name": "Kenswquad"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/m3exams.py",
"module": "src/lighteval/tasks/multilingual/tasks/m3exams.py",
"abstract": "M3Exam: Multitask Multilingual Multimodal Evaluation Benchmark It also contains\na multimodal version but we don't support that Paper:\nhttps://arxiv.org/abs/2306.05179",
"languages": [
"afrikaans",
"chinese",
"english",
"italian",
"javanese",
"portuguese",
"swahili",
"thai",
"vietnamese"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": "https://arxiv.org/abs/2306.05179",
"dataset": "chiayewken/m3exam",
"name": "M3Exams"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/mathlogicqa_rus.py",
"module": "src/lighteval/tasks/multilingual/tasks/mathlogicqa_rus.py",
"abstract": "MathLogicQA is a dataset for evaluating mathematical reasoning in language\nmodels. It consists of multiple-choice questions that require logical reasoning\nand mathematical problem-solving. This Russian version is part of the MERA\n(Multilingual Evaluation of Reasoning Abilities) benchmark.",
"languages": [
"russian"
],
"tags": [
"math",
"multilingual",
"qa",
"reasoning"
],
"paper": "https://github.com/ai-forever/MERA",
"dataset": "ai-forever/MERA",
"name": "Mathlogicqa Rus"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/meta_mmlu.py",
"module": "src/lighteval/tasks/multilingual/tasks/meta_mmlu.py",
"abstract": "Meta MMLU: A multilingual version of MMLU (using google translation)",
"languages": [
"french",
"german",
"hindi",
"italian",
"portuguese",
"spanish",
"thai"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": "https://arxiv.org/abs/2407.21783",
"dataset": "meta-llama/Meta-Llama-3.1-8B-Instruct-evals",
"name": "Meta Mmlu"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/mgsm.py",
"module": "src/lighteval/tasks/multilingual/tasks/mgsm.py",
"abstract": "Mgsm multilingual benchmark.",
"languages": [
"bengali",
"chinese",
"english",
"french",
"german",
"japanese",
"russian",
"spanish",
"swahili",
"telugu",
"thai"
],
"tags": [
"math",
"multilingual",
"reasoning"
],
"paper": null,
"dataset": "juletxara/mgsm",
"name": "Mgsm"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/mintaka.py",
"module": "src/lighteval/tasks/multilingual/tasks/mintaka.py",
"abstract": "Mintaka multilingual benchmark.",
"languages": [
"arabic",
"english",
"french",
"german",
"hindi",
"italian",
"japanese",
"portuguese",
"spanish"
],
"tags": [
"knowledge",
"multilingual",
"qa"
],
"paper": null,
"dataset": "AmazonScience/mintaka",
"name": "Mintaka"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/mkqa.py",
"module": "src/lighteval/tasks/multilingual/tasks/mkqa.py",
"abstract": "Mkqa multilingual benchmark.",
"languages": [
"arabic",
"chinese",
"chinese_hong_kong",
"chinese_traditional",
"danish",
"dutch",
"english",
"finnish",
"french",
"german",
"hebrew",
"hungarian",
"italian",
"japanese",
"khmer",
"korean",
"malay",
"norwegian",
"polish",
"portuguese",
"russian",
"spanish",
"swedish",
"thai",
"turkish",
"vietnamese"
],
"tags": [
"multilingual",
"qa"
],
"paper": null,
"dataset": "apple/mkqa",
"name": "Mkqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/mlmm_arc_challenge.py",
"module": "src/lighteval/tasks/multilingual/tasks/mlmm_arc_challenge.py",
"abstract": "ARC (AI2 Reasoning Challenge) is a dataset for question answering that requires\nreasoning. It consists of multiple-choice science questions from 3rd to 9th\ngrade exams. The dataset is split into two parts: ARC-Easy and ARC-Challenge.\nARC-Easy contains questions that can be answered correctly by both humans and\nsimple baseline models. ARC-Challenge contains questions that are difficult for\nboth humans and current AI systems. Similar to MMLU, ARC tasks uses PMI\nnormalization by default but only for the challenge set.",
"languages": [
"arabic",
"bengali",
"catalan",
"chinese",
"croatian",
"danish",
"dutch",
"french",
"german",
"hindi",
"hungarian",
"indonesian",
"italian",
"kannada",
"malayalam",
"marathi",
"nepali",
"romanian",
"russian",
"serbian",
"slovak",
"spanish",
"tamil",
"telugu",
"ukrainian",
"vietnamese"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": "https://github.com/nlp-uoregon/mlmm-evaluation",
"dataset": "jon-tow/okapi_arc_challenge",
"name": "Mlmm Arc Challenge"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/mlmm_hellaswag.py",
"module": "src/lighteval/tasks/multilingual/tasks/mlmm_hellaswag.py",
"abstract": "Hellaswag is a commonsense reasoning task that requires models to complete a\ngiven scenario with the most plausible ending. It tests the model's ability to\nunderstand and reason about everyday situations and human behavior.\nMLMM-Hellaswag: Multilingual adaptation of Hellaswag",
"languages": [
"arabic",
"armenian",
"basque",
"bengali",
"catalan",
"chinese",
"croatian",
"danish",
"dutch",
"french",
"german",
"gujarati",
"hindi",
"hungarian",
"icelandic",
"indonesian",
"italian",
"kannada",
"malayalam",
"marathi",
"nepali",
"norwegian",
"portuguese",
"romanian",
"russian",
"serbian",
"slovak",
"spanish",
"swedish",
"tamil",
"telugu",
"ukrainian",
"vietnamese"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": "https://arxiv.org/abs/2306.07610",
"dataset": "jon-tow/okapi_hellaswag",
"name": "Mlmm Hellaswag"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/mlmm_mmlu.py",
"module": "src/lighteval/tasks/multilingual/tasks/mlmm_mmlu.py",
"abstract": "MLMM MMLU: Another multilingual version of MMLU",
"languages": [
"arabic",
"bengali",
"catalan",
"chinese",
"croatian",
"danish",
"dutch",
"french",
"german",
"hindi",
"hungarian",
"indonesian",
"italian",
"kannada",
"malayalam",
"marathi",
"nepali",
"romanian",
"russian",
"serbian",
"slovak",
"spanish",
"tamil",
"telugu",
"ukrainian",
"vietnamese"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": "https://github.com/nlp-uoregon/mlmm-evaluation",
"dataset": "jon-tow/okapi_mmlu",
"name": "Mlmm Mmlu"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/mlmm_truthfulqa.py",
"module": "src/lighteval/tasks/multilingual/tasks/mlmm_truthfulqa.py",
"abstract": "TruthfulQA: Measuring How Models Mimic Human Falsehoods",
"languages": [
"arabic",
"armenian",
"basque",
"bengali",
"catalan",
"chinese",
"croatian",
"danish",
"dutch",
"french",
"german",
"gujarati",
"hindi",
"hungarian",
"icelandic",
"indonesian",
"italian",
"kannada",
"malayalam",
"marathi",
"nepali",
"norwegian",
"portuguese",
"romanian",
"russian",
"serbian",
"slovak",
"spanish",
"swedish",
"tamil",
"telugu",
"ukrainian",
"vietnamese"
],
"tags": [
"factuality",
"multilingual",
"qa"
],
"paper": "https://arxiv.org/abs/2109.07958",
"dataset": "jon-tow/okapi_truthfulqa",
"name": "Mlmm Truthfulqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/mlqa.py",
"module": "src/lighteval/tasks/multilingual/tasks/mlqa.py",
"abstract": "MLQA (MultiLingual Question Answering) is a benchmark dataset for evaluating\ncross-lingual question answering performance. It consists of QA instances in 7\nlanguages: English, Arabic, German, Spanish, Hindi, Vietnamese, and Chinese. The\ndataset is derived from the SQuAD v1.1 dataset, with questions and contexts\ntranslated by professional translators.",
"languages": [
"arabic",
"chinese",
"german",
"hindi",
"spanish",
"vietnamese"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://arxiv.org/abs/1910.07475",
"dataset": "facebook/mlqa",
"name": "Mlqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/oab_exams.py",
"module": "src/lighteval/tasks/multilingual/tasks/oab_exams.py",
"abstract": "OAB Exams: A collection of questions from the Brazilian Bar Association exam The\nexam is required for anyone who wants to practice law in Brazil",
"languages": [
"portuguese"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": "https://huggingface.co/datasets/eduagarcia/oab_exams",
"dataset": "eduagarcia/oab_exams",
"name": "Oab Exams"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/ocnli.py",
"module": "src/lighteval/tasks/multilingual/tasks/ocnli.py",
"abstract": "Native Chinese NLI dataset based.",
"languages": [
"chinese"
],
"tags": [
"classification",
"multilingual",
"nli"
],
"paper": "https://arxiv.org/pdf/2010.05444",
"dataset": "clue/clue",
"name": "Ocnli"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/openai_mmlu.py",
"module": "src/lighteval/tasks/multilingual/tasks/openai_mmlu.py",
"abstract": "Openai Mmlu multilingual benchmark.",
"languages": [
"arabic",
"bengali",
"chinese",
"french",
"german",
"hindi",
"indonesian",
"italian",
"japanese",
"korean",
"portuguese",
"spanish",
"swahili",
"yoruba"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": null,
"dataset": "openai/MMMLU",
"name": "Openai Mmlu"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/openbook_ara.py",
"module": "src/lighteval/tasks/multilingual/tasks/openbook_ara.py",
"abstract": "OpenBookQA: A Question-Answering Dataset for Open-Book Exams OpenBookQA is a\nquestion-answering dataset modeled after open-book exams for assessing human\nunderstanding of a subject. It consists of multiple-choice questions that\nrequire combining facts from a given open book with broad common knowledge. The\ntask tests language models' ability to leverage provided information and apply\ncommon sense reasoning.",
"languages": [
"arabic"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": "https://arxiv.org/abs/1809.02789",
"dataset": "OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
"name": "Openbook Ara"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/openbook_es.py",
"module": "src/lighteval/tasks/multilingual/tasks/openbook_es.py",
"abstract": "Spanish version of OpenBookQA from BSC Language Technology group",
"languages": [
"spanish"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": "https://huggingface.co/datasets/BSC-LT/openbookqa-es",
"dataset": "BSC-LT/openbookqa-es",
"name": "Openbook Es"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/openbook_rus.py",
"module": "src/lighteval/tasks/multilingual/tasks/openbook_rus.py",
"abstract": "The Russian version is part of the MERA (Multilingual Enhanced Russian NLP\nArchitectures) project.",
"languages": [
"russian"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": "https://arxiv.org/abs/2401.04531",
"dataset": "ai-forever/MERA",
"name": "Openbook Rus"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/oz.py",
"module": "src/lighteval/tasks/multilingual/tasks/oz.py",
"abstract": "OZ Eval (sr. Opšte Znanje Evaluacija) dataset was created for the purposes of\nevaluating General Knowledge of LLM models in Serbian language. Data consists\nof 1k+ high-quality questions and answers which were used as part of entry exams\nat the Faculty of Philosophy and Faculty of Organizational Sciences, University\nof Belgrade. The exams test the General Knowledge of students and were used in\nthe enrollment periods from 2003 to 2024.",
"languages": [
"serbian"
],
"tags": [
"knowledge",
"multiple-choice"
],
"paper": null,
"dataset": "DjMel/oz-eval",
"name": "OZ Serbian Evals"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/parus.py",
"module": "src/lighteval/tasks/multilingual/tasks/parus.py",
"abstract": "PARus: Plausible Alternatives for Russian PARus is the Russian adaptation of the\nCOPA task, part of the Russian SuperGLUE benchmark. It evaluates common sense\nreasoning and causal inference abilities in Russian language models.",
"languages": [
"russian"
],
"tags": [
"multilingual"
],
"paper": "https://russiansuperglue.com/tasks/task_info/PARus",
"dataset": "ai-forever/MERA",
"name": "Parus"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/paws_x.py",
"module": "src/lighteval/tasks/multilingual/tasks/paws_x.py",
"abstract": "PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification This\ndataset contains paraphrase identification pairs in multiple languages. It's\nderived from PAWS (Paraphrase Adversaries from Word Scrambling) and We treat\nparaphrase as entailment and non-paraphrase as contradiction",
"languages": [
"chinese",
"english",
"french",
"german",
"japanese",
"korean",
"spanish"
],
"tags": [
"classification",
"multilingual",
"nli"
],
"paper": "https://arxiv.org/abs/1908.11828",
"dataset": "google-research-datasets/paws-x",
"name": "Paws X"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/piqa_ar.py",
"module": "src/lighteval/tasks/multilingual/tasks/piqa_ar.py",
"abstract": "PIQA: Physical Interaction Question Answering PIQA is a benchmark for testing\nphysical commonsense reasoning. This Arabic version is a translation of the\noriginal PIQA dataset, adapted for Arabic language evaluation. It tests the\nability to reason about physical interactions in everyday situations.",
"languages": [
"arabic"
],
"tags": [
"multilingual",
"multiple-choice",
"qa",
"reasoning"
],
"paper": "https://arxiv.org/abs/1911.11641",
"dataset": "OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
"name": "Piqa Ar"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/rcb.py",
"module": "src/lighteval/tasks/multilingual/tasks/rcb.py",
"abstract": "Russian Commitment Bank (RCB) is a large-scale NLI dataset with Russian\nsentences, collected from the web and crowdsourcing.",
"languages": [
"russian"
],
"tags": [
"classification",
"multilingual",
"nli"
],
"paper": "https://arxiv.org/abs/2401.04531",
"dataset": "ai-forever/MERA",
"name": "Rcb"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/sber_squad.py",
"module": "src/lighteval/tasks/multilingual/tasks/sber_squad.py",
"abstract": "SberQuAD: A large-scale Russian reading comprehension dataset.",
"languages": [
"russian"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://arxiv.org/abs/1912.09723",
"dataset": "kuznetsoffandrey/sberquad",
"name": "Sber Squad"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/serbian_eval.py",
"module": "src/lighteval/tasks/multilingual/tasks/serbian_eval.py",
"abstract": "The tasks cover a variety of benchmarks, including: standard task like ARC[E][C],\nBoolQ, Hellaswag, OpenBookQA,PIQA, Winogrande and a custom OZ Eval.\nMMLU is separated by subject and also all in one.",
"languages": [
"serbian"
],
"tags": [
"knowledge",
"multiple-choice"
],
"paper": null,
"dataset": "datatab/serbian-llm-benchmark",
"name": "Serbian Evals"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/soqal.py",
"module": "src/lighteval/tasks/multilingual/tasks/soqal.py",
"abstract": "SOQAL: A large-scale Arabic reading comprehension dataset.",
"languages": [
"arabic"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://arxiv.org/abs/1906.05394",
"dataset": "OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
"name": "Soqal"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/squad_es.py",
"module": "src/lighteval/tasks/multilingual/tasks/squad_es.py",
"abstract": "SQuAD-es: Spanish translation of the Stanford Question Answering Dataset",
"languages": [
"spanish"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://huggingface.co/datasets/ccasimiro/squad_es",
"dataset": "ccasimiro/squad_es",
"name": "Squad Es"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/squad_it.py",
"module": "src/lighteval/tasks/multilingual/tasks/squad_it.py",
"abstract": "SQuAD-it: Italian translation of the SQuAD dataset.",
"languages": [
"italian"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://github.com/crux82/squad-it",
"dataset": "crux82/squad_it",
"name": "Squad It"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/swahili_arc.py",
"module": "src/lighteval/tasks/multilingual/tasks/swahili_arc.py",
"abstract": "Swahili Arc multilingual benchmark.",
"languages": [
"swahili"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": null,
"dataset": null,
"name": "Swahili Arc"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/thai_exams.py",
"module": "src/lighteval/tasks/multilingual/tasks/thai_exams.py",
"abstract": "Thai Exams multilingual benchmark.",
"languages": [
"thai"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": null,
"dataset": "scb10x/thai_exam",
"name": "Thai Exams"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/thaiqa.py",
"module": "src/lighteval/tasks/multilingual/tasks/thaiqa.py",
"abstract": "ThaiQA: A question answering dataset for the Thai language.",
"languages": [
"thai"
],
"tags": [
"multilingual",
"qa"
],
"paper": null,
"dataset": "lighteval/thaiqa_squad_fixed",
"name": "Thaiqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/tquad_v2.py",
"module": "src/lighteval/tasks/multilingual/tasks/tquad_v2.py",
"abstract": "TQuAD v2: Turkish Question Answering Dataset version 2.",
"languages": [
"turkish"
],
"tags": [
"multilingual",
"qa"
],
"paper": null,
"dataset": "erdometo/tquad2",
"name": "Tquad V2"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/turkic.py",
"module": "src/lighteval/tasks/multilingual/tasks/turkic.py",
"abstract": "TUMLU-mini is a benchmark for Turkic language understanding, comprising 1,000\nprompts organized into 10 subsets.",
"languages": [
"turkic"
],
"tags": [
"knowledge",
"multiple-choice"
],
"paper": "https://arxiv.org/abs/2502.11020",
"dataset": "jafarisbarov/TUMLU-mini",
"name": "Turkic Evals"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/turkish_arc.py",
"module": "src/lighteval/tasks/multilingual/tasks/turkish_arc.py",
"abstract": "Turkish ARC Comes from the Turkish leaderboard",
"languages": [
"turkish"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": null,
"dataset": "malhajar/arc-tr",
"name": "Turkish Arc"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py",
"module": "src/lighteval/tasks/multilingual/tasks/turkish_mmlu.py",
"abstract": "Turkish Mmlu multilingual benchmark.",
"languages": [
"turkish"
],
"tags": [
"knowledge",
"multilingual",
"multiple-choice"
],
"paper": null,
"dataset": "AYueksel/TurkishMMLU",
"name": "Turkish Mmlu"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/tydiqa.py",
"module": "src/lighteval/tasks/multilingual/tasks/tydiqa.py",
"abstract": "Other QA tasks for RC TyDi QA: A benchmark for information-seeking question answering in typologically diverse languages. https://arxiv.org/abs/2003.05002",
"languages": [
"arabic",
"bengali",
"english",
"finnish",
"indonesian",
"japanese",
"korean",
"russian",
"swahili",
"telugu",
"thai"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://arxiv.org/abs/2003.05002",
"dataset": "google-research-datasets/tydiqa",
"name": "Tydiqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/worldtree_rus.py",
"module": "src/lighteval/tasks/multilingual/tasks/worldtree_rus.py",
"abstract": "WorldTree is a dataset for multi-hop inference in science question answering. It\nprovides explanations for elementary science questions by combining facts from a\nsemi-structured knowledge base. This Russian version is part of the MERA\n(Multilingual Evaluation of Reasoning Abilities) benchmark.",
"languages": [
"russian"
],
"tags": [
"multilingual"
],
"paper": "https://github.com/ai-forever/MERA",
"dataset": "ai-forever/MERA",
"name": "Worldtree Rus"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/xcodah.py",
"module": "src/lighteval/tasks/multilingual/tasks/xcodah.py",
"abstract": "Xcodah multilingual benchmark.",
"languages": [
"arabic",
"chinese",
"dutch",
"english",
"french",
"german",
"hindi",
"italian",
"japanese",
"polish",
"portuguese",
"russian",
"spanish",
"swahili",
"urdu",
"vietnamese"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": null,
"dataset": "INK-USC/xcsr",
"name": "Xcodah"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/xcopa.py",
"module": "src/lighteval/tasks/multilingual/tasks/xcopa.py",
"abstract": "COPA (Choice of Plausible Alternatives) tasks involve determining the most\nplausible cause or effect for a given premise. These tasks test common sense\nreasoning and causal inference abilities. XCOPA: Cross-lingual Choice of\nPlausible Alternatives.",
"languages": [
"arabic",
"chinese",
"estonian",
"haitian",
"indonesian",
"italian",
"quechua",
"swahili",
"tamil",
"thai",
"turkish",
"vietnamese"
],
"tags": [
"multilingual",
"multiple-choice",
"narrative",
"reasoning"
],
"paper": "https://aclanthology.org/2020.emnlp-main.185/",
"dataset": null,
"name": "Xcopa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/xcsqa.py",
"module": "src/lighteval/tasks/multilingual/tasks/xcsqa.py",
"abstract": "XCSQA (Cross-lingual Commonsense QA) is part of the XCSR (Cross-lingual\nCommonsense Reasoning) benchmark It is a multilingual extension of the\nCommonsenseQA dataset, covering 16 languages The task involves answering\nmultiple-choice questions that require commonsense reasoning Uses PMI\nnormalization.",
"languages": [
"arabic",
"chinese",
"dutch",
"english",
"french",
"german",
"hindi",
"italian",
"japanese",
"polish",
"portuguese",
"russian",
"spanish",
"swahili",
"urdu",
"vietnamese"
],
"tags": [
"multilingual",
"multiple-choice",
"qa",
"reasoning"
],
"paper": "https://arxiv.org/abs/2110.08462",
"dataset": "INK-USC/xcsr",
"name": "Xcsqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/xnli.py",
"module": "src/lighteval/tasks/multilingual/tasks/xnli.py",
"abstract": "NLI (Natural Language Inference) tasks involve determining the logical\nrelationship between two given sentences: a premise and a hypothesis. The goal\nis to classify whether the hypothesis is entailed by, contradicts, or is neutral\nwith respect to the premise. After our inspection we found the neutral label to\nbe quite ambiguous and decided to exclude it. But you can easily add it by\nmodifying the adapters The XNLI dataset is a multilingual variant of MultiNLI",
"languages": [
"arabic",
"bulgarian",
"chinese",
"english",
"french",
"german",
"greek",
"hindi",
"russian",
"spanish",
"swahili",
"thai",
"turkish",
"urdu",
"vietnamese"
],
"tags": [
"classification",
"multilingual",
"nli"
],
"paper": "https://aclanthology.org/D18-1269/",
"dataset": "facebook/xnli",
"name": "Xnli"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/xnli2.py",
"module": "src/lighteval/tasks/multilingual/tasks/xnli2.py",
"abstract": "Improvement on XNLI with better translation, from our experience models tend to\nperform better on XNLI2.0 than XNLI.",
"languages": [
"arabic",
"assamese",
"bengali",
"bulgarian",
"chinese",
"english",
"french",
"german",
"greek",
"gujarati",
"hindi",
"kannada",
"marathi",
"punjabi",
"russian",
"sanskrit",
"spanish",
"swahili",
"tamil",
"thai",
"turkish",
"urdu",
"vietnamese"
],
"tags": [
"classification",
"multilingual",
"nli"
],
"paper": "https://arxiv.org/abs/2301.06527",
"dataset": null,
"name": "Xnli2"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/xnli_indic.py",
"module": "src/lighteval/tasks/multilingual/tasks/xnli_indic.py",
"abstract": "Another variant of XNLI, with emphasis on Indic languages.",
"languages": [
"assamese",
"bengali",
"gujarati",
"hindi",
"kannada",
"malayalam",
"marathi",
"oriya",
"punjabi",
"tamil",
"telugu"
],
"tags": [
"classification",
"multilingual",
"nli"
],
"paper": "https://arxiv.org/abs/2204.08776",
"dataset": "Divyanshu/indicxnli",
"name": "Xnli Indic"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/xquad.py",
"module": "src/lighteval/tasks/multilingual/tasks/xquad.py",
"abstract": "Reading Comprehension (RC) tasks evaluate a model's ability to understand and\nextract information from text passages. These tasks typically involve answering\nquestions based on given contexts, spanning multiple languages and formats. Add\nRC tasks supporting about 130 unique languages/scripts. SQuAD - like XQuAD:\nCross-lingual Question Answering Dataset, extending SQuAD to 11 languages.",
"languages": [
"arabic",
"chinese",
"english",
"german",
"greek",
"hindi",
"romanian",
"russian",
"spanish",
"thai",
"turkish",
"vietnamese"
],
"tags": [
"multilingual",
"qa"
],
"paper": "https://arxiv.org/abs/1910.11856",
"dataset": "google/xquad",
"name": "Xquad"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/xstory.py",
"module": "src/lighteval/tasks/multilingual/tasks/xstory.py",
"abstract": "Xstory multilingual benchmark.",
"languages": [
"arabic",
"basque",
"burmese",
"chinese",
"hindi",
"indonesian",
"russian",
"spanish",
"swahili",
"telugu"
],
"tags": [
"multilingual",
"narrative"
],
"paper": null,
"dataset": "juletxara/xstory_cloze",
"name": "Xstory"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/multilingual/tasks/xwinograd.py",
"module": "src/lighteval/tasks/multilingual/tasks/xwinograd.py",
"abstract": "Xwinograd multilingual benchmark.",
"languages": [
"chinese",
"english",
"french",
"japanese",
"portuguese",
"russian"
],
"tags": [
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": null,
"dataset": "Muennighoff/xwinograd",
"name": "Xwinograd"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/agieval.py",
"module": "src/lighteval/tasks/tasks/agieval.py",
"abstract": "AGIEval is a human-centric benchmark specifically designed to evaluate the\ngeneral abilities of foundation models in tasks pertinent to human cognition and\nproblem-solving. This benchmark is derived from 20 official, public, and\nhigh-standard admission and qualification exams intended for general human\ntest-takers, such as general college admission tests (e.g., Chinese College\nEntrance Exam (Gaokao) and American SAT), law school admission tests, math\ncompetitions, lawyer qualification tests, and national civil service exams.",
"languages": [
"english",
"chinese"
],
"tags": [
"biology",
"chemistry",
"geography",
"history",
"knowledge",
"language",
"multiple-choice",
"physics",
"reasoning"
],
"paper": "https://arxiv.org/abs/2304.06364",
"dataset": "dmayhem93/agieval-aqua-rat, dmayhem93/agieval-gaokao-biology, dmayhem93/agieval-gaokao-chemistry, dmayhem93/agieval-gaokao-chinese, dmayhem93/agieval-gaokao-english, dmayhem93/agieval-gaokao-geography, dmayhem93/agieval-gaokao-history, dmayhem93/agieval-gaokao-mathqa, dmayhem93/agieval-gaokao-physics, dmayhem93/agieval-logiqa-en, dmayhem93/agieval-logiqa-zh, dmayhem93/agieval-lsat-ar, dmayhem93/agieval-lsat-lr, dmayhem93/agieval-lsat-rc, dmayhem93/agieval-sat-en, dmayhem93/agieval-sat-en-without-passage, dmayhem93/agieval-sat-math",
"name": "Agieval"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/aime.py",
"module": "src/lighteval/tasks/tasks/aime.py",
"abstract": "The American Invitational Mathematics Examination (AIME) is a prestigious,\ninvite-only mathematics competition for high-school students who perform in the\ntop 5% of the AMC 12 mathematics exam. It involves 15 questions of increasing\ndifficulty, with the answer to every question being a single integer from 0 to\n999. The median score is historically between 4 and 6 questions correct (out of\nthe 15 possible). Two versions of the test are given every year (thirty\nquestions total).",
"languages": [
"english"
],
"tags": [
"math",
"reasoning"
],
"paper": "https://maa.org/aime-thresholds-are-available/",
"dataset": "HuggingFaceH4/aime_2024, yentinglin/aime_2025",
"name": "Aime"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/aimo.py",
"module": "src/lighteval/tasks/tasks/aimo.py",
"abstract": "Task to evaluate LLMs on the training set of the Kaggle AIMO competition:",
"languages": [
"english"
],
"tags": [
"math",
"reasoning"
],
"paper": null,
"dataset": "https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize",
"name": "AIMO Progress Prize 1"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/anli.py",
"module": "src/lighteval/tasks/tasks/anli.py",
"abstract": "The Adversarial Natural Language Inference (ANLI) is a new large-scale NLI\nbenchmark dataset, The dataset is collected via an iterative, adversarial\nhuman-and-model-in-the-loop procedure. ANLI is much more difficult than its\npredecessors including SNLI and MNLI. It contains three rounds. Each round has\ntrain/dev/test splits.",
"languages": [
"english"
],
"tags": [
"nli",
"reasoning"
],
"paper": "https://arxiv.org/abs/1910.14599",
"dataset": "facebook/anli",
"name": "Anli"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/arc.py",
"module": "src/lighteval/tasks/tasks/arc.py",
"abstract": "7,787 genuine grade-school level, multiple-choice science questions, assembled\nto encourage research in advanced question-answering. The dataset is partitioned\ninto a Challenge Set and an Easy Set, where the former contains only questions\nanswered incorrectly by both a retrieval-based algorithm and a word\nco-occurrence algorithm",
"languages": [
"english"
],
"tags": [
"multiple-choice"
],
"paper": "https://arxiv.org/abs/1803.05457",
"dataset": "allenai/ai2_arc",
"name": "Arc"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/arc_agi_2.py",
"module": "src/lighteval/tasks/tasks/arc_agi_2.py",
"abstract": "ARC-AGI tasks are a series of three to five input and output tasks followed by a\nfinal task with only the input listed. Each task tests the utilization of a\nspecific learned skill based on a minimal number of cognitive priors.\nIn their native form, tasks are a JSON lists of integers. These JSON can also be\nrepresented visually as a grid of colors using an ARC-AGI task viewer. You can\nview an example of a task here.\nA successful submission is a pixel-perfect description (color and position) of\nthe final task's output.\n100% of tasks in the ARC-AGI-2 dataset were solved by a minimim of 2 people in\nless than or equal to 2 attempts (many were solved more). ARC-AGI-2 is more\ndifficult for AI.",
"languages": [
"english"
],
"tags": [
"multiple-choice"
],
"paper": "https://arcprize.org/guide",
"dataset": "arc-agi-community/arc-agi-2",
"name": "ArcAgi 2"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/arithmetic.py",
"module": "src/lighteval/tasks/tasks/arithmetic.py",
"abstract": "A small battery of 10 tests that involve asking language models a simple\narithmetic problem in natural language.",
"languages": [
"english"
],
"tags": [
"math",
"reasoning"
],
"paper": "https://arxiv.org/abs/2005.14165",
"dataset": "EleutherAI/arithmetic",
"name": "Arithmetic"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/asdiv.py",
"module": "src/lighteval/tasks/tasks/asdiv.py",
"abstract": "ASDiv is a dataset for arithmetic reasoning that contains 2,000+ questions\ncovering addition, subtraction, multiplication, and division.",
"languages": [
"english"
],
"tags": [
"math",
"reasoning"
],
"paper": "https://arxiv.org/abs/2410.12853",
"dataset": "EleutherAI/asdiv",
"name": "Asdiv"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/babi_qa.py",
"module": "src/lighteval/tasks/tasks/babi_qa.py",
"abstract": "The bAbI benchmark for measuring understanding and reasoning, evaluates reading\ncomprehension via question answering.",
"languages": [
"english"
],
"tags": [
"qa",
"reasoning"
],
"paper": "https://arxiv.org/abs/1502.05698",
"dataset": "facebook/babi_qa",
"name": "Babi Qa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/bbq.py",
"module": "src/lighteval/tasks/tasks/bbq.py",
"abstract": "The Bias Benchmark for Question Answering (BBQ) for measuring social bias in\nquestion answering in ambiguous and unambigous context .",
"languages": [
"english"
],
"tags": [
"bias",
"multiple-choice",
"qa"
],
"paper": "https://arxiv.org/abs/2110.08193",
"dataset": "lighteval/bbq_helm",
"name": "Bbq"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/bigbench.py",
"module": "src/lighteval/tasks/tasks/bigbench.py",
"abstract": "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models\n166 tasks from bigbench benchmark.",
"languages": [
"english"
],
"tags": [
"reasoning"
],
"paper": "https://arxiv.org/abs/2206.04615",
"dataset": "tasksource/bigbench",
"name": "Bigbench"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/bigbench_hard.py",
"module": "src/lighteval/tasks/tasks/bigbench_hard.py",
"abstract": "",
"languages": [],
"tags": [
"reasoning"
],
"paper": null,
"dataset": "lighteval/bbh",
"name": "Bigbench Hard"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/blimp.py",
"module": "src/lighteval/tasks/tasks/blimp.py",
"abstract": "BLiMP is a challenge set for evaluating what language models (LMs) know\nabout major grammatical phenomena in English. BLiMP consists of 67\nsub-datasets, each containing 1000 minimal pairs isolating specific\ncontrasts in syntax, morphology, or semantics. The data is automatically\ngenerated according to expert-crafted grammars.",
"languages": [
"english"
],
"tags": [
"language-modeling"
],
"paper": "https://arxiv.org/abs/1912.00582",
"dataset": "nyu-mll/blimp",
"name": "Blimp"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/bold.py",
"module": "src/lighteval/tasks/tasks/bold.py",
"abstract": "The Bias in Open-Ended Language Generation Dataset (BOLD) for measuring biases\nand toxicity in open-ended language generation.",
"languages": [
"english"
],
"tags": [
"bias",
"generation"
],
"paper": "https://dl.acm.org/doi/10.1145/3442188.3445924",
"dataset": "lighteval/bold_helm",
"name": "Bold"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/boolq.py",
"module": "src/lighteval/tasks/tasks/boolq.py",
"abstract": "The BoolQ benchmark for binary (yes/no) question answering.",
"languages": [
"english"
],
"tags": [
"qa"
],
"paper": "https://arxiv.org/abs/1905.11946",
"dataset": "lighteval/boolq_helm",
"name": "Boolq"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/civil_comments.py",
"module": "src/lighteval/tasks/tasks/civil_comments.py",
"abstract": "The CivilComments benchmark for toxicity detection.",
"languages": [
"english"
],
"tags": [
"bias",
"classification"
],
"paper": "https://arxiv.org/abs/1903.04561",
"dataset": "lighteval/civil_comments_helm",
"name": "Civil Comments"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/commonsenseqa.py",
"module": "src/lighteval/tasks/tasks/commonsenseqa.py",
"abstract": "CommonsenseQA is a new multiple-choice question answering dataset that requires\ndifferent types of commonsense knowledge to predict the correct answers . It\ncontains 12,102 questions with one correct answer and four distractor answers.\nThe dataset is provided in two major training/validation/testing set splits:\n\"Random split\" which is the main evaluation split, and \"Question token split\",\nsee paper for details.",
"languages": [
"english"
],
"tags": [
"commonsense",
"multiple-choice",
"qa"
],
"paper": "https://arxiv.org/abs/1811.00937",
"dataset": "tau/commonsense_qa",
"name": "Commonsenseqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/coqa.py",
"module": "src/lighteval/tasks/tasks/coqa.py",
"abstract": "CoQA is a large-scale dataset for building Conversational Question Answering\nsystems. The goal of the CoQA challenge is to measure the ability of machines to\nunderstand a text passage and answer a series of interconnected questions that\nappear in a conversation.",
"languages": [
"english"
],
"tags": [
"dialog",
"qa"
],
"paper": "https://arxiv.org/abs/1808.07042",
"dataset": "stanfordnlp/coqa",
"name": "Coqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/covid_dialogue.py",
"module": "src/lighteval/tasks/tasks/covid_dialogue.py",
"abstract": "The COVID-19 Dialogue dataset is a collection of 500+ dialogues between\ndoctors and patients during the COVID-19 pandemic.",
"languages": [
"english"
],
"tags": [
"dialog",
"medical"
],
"paper": "https://arxiv.org/abs/2004.06561",
"dataset": "lighteval/covid_dialogue",
"name": "Covid Dialogue"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/custom_task_classification_grammar_task.py",
"module": "src/lighteval/tasks/tasks/custom_task_classification_grammar_task.py",
"abstract": "This task performs emotion classification classifying text into one of six\nemotion categories: sadness, joy, love, anger, fear, surprise.",
"languages": [
"english"
],
"tags": [
"emotion",
"classification",
"multiple-choice"
],
"paper": null,
"dataset": "dair-ai/emotion",
"name": "Emotion Classification"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/drop_qa.py",
"module": "src/lighteval/tasks/tasks/drop_qa.py",
"abstract": "The DROP dataset is a new question-answering dataset designed to evaluate the\nability of language models to answer complex questions that require reasoning\nover multiple sentences.",
"languages": [
"english"
],
"tags": [
"math",
"qa",
"reasoning"
],
"paper": "https://arxiv.org/abs/1810.00505",
"dataset": "lighteval/drop_harness",
"name": "Drop Qa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/dyck_language.py",
"module": "src/lighteval/tasks/tasks/dyck_language.py",
"abstract": "Scenario testing hierarchical reasoning through the Dyck formal languages.",
"languages": [
"english"
],
"tags": [
"reasoning"
],
"paper": "https://aclanthology.org/W19-3905/",
"dataset": "lighteval/DyckLanguage",
"name": "Dyck Language"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/entity_data_imputation.py",
"module": "src/lighteval/tasks/tasks/entity_data_imputation.py",
"abstract": "Scenario that tests the ability to impute missing entities in a data table.",
"languages": [
"english"
],
"tags": [
"reasoning"
],
"paper": "https://ieeexplore.ieee.org/document/9458712",
"dataset": "lighteval/Buy, lighteval/Restaurant",
"name": "Entity Data Imputation"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/entitymatching.py",
"module": "src/lighteval/tasks/tasks/entitymatching.py",
"abstract": "Simple entity matching benchmark.",
"languages": [
"english"
],
"tags": [
"classification",
"reasoning"
],
"paper": "https://dl.acm.org/doi/10.14778/3007263.3007314",
"dataset": "lighteval/EntityMatching",
"name": "Entitymatching"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/ethics.py",
"module": "src/lighteval/tasks/tasks/ethics.py",
"abstract": "The Ethics benchmark for evaluating the ability of language models to reason about\nethical issues.",
"languages": [
"english"
],
"tags": [
"classification",
"ethics",
"justice",
"morality",
"utilitarianism",
"virtue"
],
"paper": "https://arxiv.org/abs/2008.02275",
"dataset": "lighteval/hendrycks_ethics",
"name": "Ethics"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/glue.py",
"module": "src/lighteval/tasks/tasks/glue.py",
"abstract": "The General Language Understanding Evaluation (GLUE) benchmark is a collection\nof resources for training, evaluating, and analyzing natural language\nunderstanding systems.",
"languages": [
"english"
],
"tags": [
"classification",
"language-understanding"
],
"paper": null,
"dataset": "nyu-mll/glue, aps/super_glue",
"name": "GLUE"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/gpqa.py",
"module": "src/lighteval/tasks/tasks/gpqa.py",
"abstract": "GPQA is a dataset of 448 expert-written multiple-choice questions in biology,\nphysics, and chemistry, designed to test graduate-level reasoning. The questions\nare extremely difficult—PhD-level experts score about 65%, skilled non-experts\n34% (even with web access), and GPT-4 around 39%. GPQA aims to support research\non scalable oversight, helping humans evaluate and trust AI systems that may\nexceed human expertise.",
"languages": [
"english"
],
"tags": [
"biology",
"chemistry",
"graduate-level",
"multiple-choice",
"physics",
"qa",
"reasoning",
"science"
],
"paper": "https://arxiv.org/abs/2311.12022",
"dataset": "Idavidrein/gpqa",
"name": "Gpqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/gsm8k.py",
"module": "src/lighteval/tasks/tasks/gsm8k.py",
"abstract": "GSM8K is a dataset of 8,000+ high-quality, single-step arithmetic word problems.",
"languages": [
"english"
],
"tags": [
"math",
"reasoning"
],
"paper": "https://arxiv.org/abs/2110.14168",
"dataset": "openai/gsm8k",
"name": "Gsm8K"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/gsm_plus.py",
"module": "src/lighteval/tasks/tasks/gsm_plus.py",
"abstract": "GSM-Plus is an adversarial extension of GSM8K that tests the robustness of LLMs'\nmathematical reasoning by introducing varied perturbations to grade-school math\nproblems.",
"languages": [
"english"
],
"tags": [
"math",
"reasoning"
],
"paper": "https://arxiv.org/abs/2402.19255",
"dataset": "qintongli/GSM-Plus",
"name": "Gsm Plus"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/headqa.py",
"module": "src/lighteval/tasks/tasks/headqa.py",
"abstract": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to\naccess a specialized position in the Spanish healthcare system, and are\nchallenging even for highly specialized humans. They are designed by the\nMinisterio de Sanidad, Consumo y Bienestar Social, who also provides direct\naccess to the exams of the last 5 years.",
"languages": [
"english",
"spanish"
],
"tags": [
"health",
"medical",
"multiple-choice",
"qa"
],
"paper": "https://arxiv.org/abs/1906.04701",
"dataset": "lighteval/headqa_harness",
"name": "Headqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/hellaswag.py",
"module": "src/lighteval/tasks/tasks/hellaswag.py",
"abstract": "HellaSwag is a commonsense inference benchmark designed to challenge language\nmodels with adversarially filtered multiple-choice questions.",
"languages": [
"english"
],
"tags": [
"multiple-choice",
"narrative",
"reasoning"
],
"paper": "https://arxiv.org/abs/1905.07830",
"dataset": "Rowan/hellaswag",
"name": "Hellaswag"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/hle/main.py",
"module": "src/lighteval/tasks/tasks/hle/main.py",
"abstract": "Humanity's Last Exam (HLE) is a global collaborative effort, with questions from\nnearly 1,000 subject expert contributors affiliated with over 500 institutions\nacross 50 countries - comprised mostly of professors, researchers, and graduate\ndegree holders.",
"languages": [
"english"
],
"tags": [
"qa",
"reasoning",
"general-knowledge"
],
"paper": "https://arxiv.org/abs/2501.14249",
"dataset": "cais/hle",
"name": "Humanity's Last Exam"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/ifbench/main.py",
"module": "src/lighteval/tasks/tasks/ifbench/main.py",
"abstract": "Challenging benchmark for precise instruction following.",
"languages": [
"english"
],
"tags": [
"instruction-following"
],
"paper": "https://arxiv.org/abs/2507.02833",
"dataset": "allenai/IFBench_test, allenai/IFBench_multi-turn",
"name": "IFBench"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/ifeval/main.py",
"module": "src/lighteval/tasks/tasks/ifeval/main.py",
"abstract": "Very specific task where there are no precise outputs but instead we test if the\nformat obeys rules.",
"languages": [
"english"
],
"tags": [
"instruction-following"
],
"paper": "https://arxiv.org/abs/2311.07911",
"dataset": "google/IFEval",
"name": "IFEval"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/imdb.py",
"module": "src/lighteval/tasks/tasks/imdb.py",
"abstract": "The IMDB benchmark for sentiment analysis in movie review, from:\nLearning Word Vectors for Sentiment Analysis",
"languages": [
"english"
],
"tags": [
"classification"
],
"paper": "https://aclanthology.org/P11-1015/",
"dataset": "lighteval/IMDB_helm",
"name": "Imdb"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/jeopardy.py",
"module": "src/lighteval/tasks/tasks/jeopardy.py",
"abstract": "Jeopardy is a dataset of questions and answers from the Jeopardy game show.",
"languages": [
"english"
],
"tags": [
"knowledge",
"qa"
],
"paper": null,
"dataset": "openaccess-ai-collective/jeopardy",
"name": "Jeopardy"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/lambada.py",
"module": "src/lighteval/tasks/tasks/lambada.py",
"abstract": "LAMBADA is a benchmark for testing language models’ ability to understand broad\nnarrative context. Each passage requires predicting its final word—easy for\nhumans given the full passage but impossible from just the last sentence.\nSuccess demands long-range discourse comprehension.",
"languages": [
"english"
],
"tags": [
"language-modeling"
],
"paper": "https://arxiv.org/abs/1606.06031",
"dataset": "cimec/lambada",
"name": "Lambada"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/lcb/main.py",
"module": "src/lighteval/tasks/tasks/lcb/main.py",
"abstract": "LiveCodeBench collects problems from periodic contests on LeetCode, AtCoder, and\nCodeforces platforms and uses them for constructing a holistic benchmark for\nevaluating Code LLMs across variety of code-related scenarios continuously over\ntime.",
"languages": [
"english"
],
"tags": [
"code-generation"
],
"paper": "https://livecodebench.github.io/",
"dataset": "lighteval/code_generation_lite",
"name": "Live Code Bench"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/legal_summarization.py",
"module": "src/lighteval/tasks/tasks/legal_summarization.py",
"abstract": "LegalSummarization is a dataset for legal summarization.",
"languages": [
"english"
],
"tags": [
"legal",
"summarization"
],
"paper": "https://arxiv.org/abs/2210.13448\nhttps://arxiv.org/abs/2210.13448",
"dataset": "lighteval/legal_summarization",
"name": "Legal Summarization"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/legalsupport.py",
"module": "src/lighteval/tasks/tasks/legalsupport.py",
"abstract": "Measures fine-grained legal reasoning through reverse entailment.",
"languages": [
"english"
],
"tags": [
"legal"
],
"paper": null,
"dataset": "lighteval/LegalSupport",
"name": "Legalsupport"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/lexglue.py",
"module": "src/lighteval/tasks/tasks/lexglue.py",
"abstract": "LexGLUE: A Benchmark Dataset for Legal Language Understanding in English",
"languages": [
"english"
],
"tags": [
"classification",
"legal"
],
"paper": "https://arxiv.org/abs/2110.00976",
"dataset": "lighteval/lexglue",
"name": "Lexglue"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/lextreme.py",
"module": "src/lighteval/tasks/tasks/lextreme.py",
"abstract": "LEXTREME: A Multi-Lingual and Multi-Task Benchmark for the Legal Domain",
"languages": [
"bulgarian",
"czech",
"danish",
"german",
"greek",
"english",
"spanish",
"estonian",
"finnish",
"french",
"ga",
"croatian",
"hungarian",
"italian",
"lithuanian",
"latvian",
"mt",
"dutch",
"polish",
"portuguese",
"romanian",
"slovak",
"slovenian",
"swedish"
],
"tags": [
"classification",
"legal"
],
"paper": "https://arxiv.org/abs/2301.13126",
"dataset": "lighteval/lextreme",
"name": "Lextreme"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/logiqa.py",
"module": "src/lighteval/tasks/tasks/logiqa.py",
"abstract": "LogiQA is a machine reading comprehension dataset focused on testing logical\nreasoning abilities. It contains 8,678 expert-written multiple-choice questions\ncovering various types of deductive reasoning. While humans perform strongly,\nstate-of-the-art models lag far behind, making LogiQA a benchmark for advancing\nlogical reasoning in NLP systems.",
"languages": [
"english"
],
"tags": [
"qa"
],
"paper": "https://arxiv.org/abs/2007.08124",
"dataset": "lighteval/logiqa_harness",
"name": "Logiqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/lsat_qa.py",
"module": "src/lighteval/tasks/tasks/lsat_qa.py",
"abstract": "Questions from law school admission tests.",
"languages": [
"english"
],
"tags": [
"legal",
"qa"
],
"paper": null,
"dataset": "lighteval/lsat_qa",
"name": "Lsat Qa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/math.py",
"module": "src/lighteval/tasks/tasks/math.py",
"abstract": "",
"languages": [
"english"
],
"tags": [
"math",
"reasoning"
],
"paper": "https://arxiv.org/abs/2305.20050",
"dataset": "DigitalLearningGmbH/MATH-lighteval",
"name": "Math"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/math_500.py",
"module": "src/lighteval/tasks/tasks/math_500.py",
"abstract": "This dataset contains a subset of 500 problems from the MATH benchmark that\nOpenAI created in their Let's Verify Step by Step paper.",
"languages": [
"english"
],
"tags": [
"math",
"reasoning"
],
"paper": "https://arxiv.org/abs/2305.20050",
"dataset": "HuggingFaceH4/MATH-500",
"name": "Math 500"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/mathqa.py",
"module": "src/lighteval/tasks/tasks/mathqa.py",
"abstract": "large-scale dataset of math word problems. Our dataset is gathered by using a\nnew representation language to annotate over the AQuA-RAT dataset with\nfully-specified operational programs. AQuA-RAT has provided the questions,\noptions, rationale, and the correct options.",
"languages": [
"english"
],
"tags": [
"math",
"qa",
"reasoning"
],
"paper": "https://arxiv.org/abs/1905.13319",
"dataset": "allenai/math_qa",
"name": "Mathqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/med.py",
"module": "src/lighteval/tasks/tasks/med.py",
"abstract": "A Large-scale Multi-Subject Multi-Choice Dataset for Medical domain Question Answering",
"languages": [
"english"
],
"tags": [
"health",
"medical"
],
"paper": "https://medmcqa.github.io/",
"dataset": "lighteval/med_mcqa, lighteval/med_paragraph_simplification, bigbio/med_qa",
"name": "Med"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/med_dialog.py",
"module": "src/lighteval/tasks/tasks/med_dialog.py",
"abstract": "A collection of medical dialogue datasets.",
"languages": [
"english"
],
"tags": [
"dialog",
"health",
"medical"
],
"paper": null,
"dataset": "lighteval/med_dialog",
"name": "Med Dialog"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/mgsm.py",
"module": "src/lighteval/tasks/tasks/mgsm.py",
"abstract": "Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school\nmath problems.\nThe same 250 problems from GSM8K are each translated via human annotators in 10\nlanguages.",
"languages": [
"english",
"spanish",
"french",
"german",
"russian",
"chinese",
"japanese",
"thai",
"swahili",
"bengali",
"telugu"
],
"tags": [
"math",
"multilingual",
"reasoning"
],
"paper": "https://arxiv.org/abs/2210.03057",
"dataset": "juletxara/mgsm",
"name": "Mgsm"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/mix_eval/main.py",
"module": "src/lighteval/tasks/tasks/mix_eval/main.py",
"abstract": "Ground-truth-based dynamic benchmark derived from off-the-shelf benchmark\nmixtures, which evaluates LLMs with a highly capable model ranking (i.e., 0.96\ncorrelation with Chatbot Arena) while running locally and quickly (6% the time\nand cost of running MMLU), with its queries being stably and effortlessly\nupdated every month to avoid contamination.",
"languages": [
"english"
],
"tags": [
"general-knowledge",
"reasoning",
"qa"
],
"paper": "https://mixeval.github.io/",
"dataset": "MixEval/MixEval",
"name": "Mix Eval"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/mmlu.py",
"module": "src/lighteval/tasks/tasks/mmlu.py",
"abstract": "MMMLU is a benchmark of general-knowledge and English language understanding.",
"languages": [
"english"
],
"tags": [
"general-knowledge",
"knowledge",
"multiple-choice"
],
"paper": "https://arxiv.org/abs/2009.03300",
"dataset": "lighteval/mmlu",
"name": "Mmlu"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/mmlu_redux.py",
"module": "src/lighteval/tasks/tasks/mmlu_redux.py",
"abstract": "MMLU-Redux is a subset of 5,700 manually re-annotated questions across 57 MMLU subjects.",
"languages": [
"english"
],
"tags": [
"general-knowledge",
"knowledge",
"multiple-choice"
],
"paper": "https://arxiv.org/abs/2406.04127",
"dataset": "edinburgh-dawg/mmlu-redux-2.0",
"name": "Mmlu Redux"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/mmmu_pro.py",
"module": "src/lighteval/tasks/tasks/mmmu_pro.py",
"abstract": "",
"languages": [
"english"
],
"tags": [
"general-knowledge",
"knowledge",
"multimodal",
"multiple-choice"
],
"paper": "https://arxiv.org/abs/2409.02813",
"dataset": "MMMU/MMMU_pro",
"name": "Mmmu Pro"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/mt_bench/main.py",
"module": "src/lighteval/tasks/tasks/mt_bench/main.py",
"abstract": "MT-Bench is a multi-turn conversational benchmark for evaluating language\nmodels. It consists of 80 high-quality multi-turn questions across 8 common\ncategories (writing, roleplay, reasoning, math, coding, extraction, STEM,\nhumanities). Model responses are evaluated by a judge LLM.",
"languages": [
"english"
],
"tags": [
"conversational",
"generation",
"multi-turn"
],
"paper": "https://arxiv.org/abs/2402.14762",
"dataset": "lighteval/mt-bench",
"name": "Mt Bench"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/musr.py",
"module": "src/lighteval/tasks/tasks/musr.py",
"abstract": "MuSR is a benchmark for evaluating multistep reasoning in natural language\nnarratives. Built using a neurosymbolic synthetic-to-natural generation process,\nit features complex, realistic tasks—such as long-form murder mysteries.",
"languages": [
"english"
],
"tags": [
"long-context",
"multiple-choice",
"reasoning"
],
"paper": "https://arxiv.org/abs/2310.16049",
"dataset": "TAUR-Lab/MuSR",
"name": "Musr"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/narrativeqa.py",
"module": "src/lighteval/tasks/tasks/narrativeqa.py",
"abstract": "NarrativeQA is a reading comprehension benchmark that tests deep understanding\nof full narratives—books and movie scripts—rather than shallow text matching. To\nanswer its questions, models must integrate information across entire stories.",
"languages": [
"english"
],
"tags": [
"qa",
"reading-comprehension"
],
"paper": "https://aclanthology.org/Q18-1023/",
"dataset": "lighteval/narrative_qa_helm",
"name": "Narrativeqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/natural_questions.py",
"module": "src/lighteval/tasks/tasks/natural_questions.py",
"abstract": "This dataset is a collection of question-answer pairs from the Natural Questions\ndataset. See Natural Questions for additional information. This dataset can be\nused directly with Sentence Transformers to train embedding models.",
"languages": [
"english"
],
"tags": [
"general-knowledge",
"qa"
],
"paper": "https://ai.google.com/research/NaturalQuestions",
"dataset": "lighteval/small_natural_questions",
"name": "Natural Questions"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/numeracy.py",
"module": "src/lighteval/tasks/tasks/numeracy.py",
"abstract": "Numeracy is a benchmark for evaluating the ability of language models to reason about mathematics.",
"languages": [
"english"
],
"tags": [
"math",
"reasoning"
],
"paper": null,
"dataset": "lighteval/numeracy",
"name": "Numeracy"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/olympiade_bench/main.py",
"module": "src/lighteval/tasks/tasks/olympiade_bench/main.py",
"abstract": "OlympiadBench is a benchmark for evaluating the performance of language models\non olympiad problems.",
"languages": [
"english",
"chinese"
],
"tags": [
"math",
"reasoning",
"language"
],
"paper": "https://arxiv.org/abs/2402.14008",
"dataset": "Hothan/OlympiadBench",
"name": "Olympiade Bench"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/openbookqa.py",
"module": "src/lighteval/tasks/tasks/openbookqa.py",
"abstract": "OpenBookQA is a question-answering dataset modeled after open-book exams for\nassessing human understanding of a subject. It contains multiple-choice\nquestions that require combining facts from a given open book with broad common\nknowledge. The task tests language models' ability to leverage provided\ninformation and apply common sense reasoning.",
"languages": [
"english"
],
"tags": [
"multiple-choice",
"qa"
],
"paper": "https://arxiv.org/abs/1809.02789",
"dataset": "allenai/openbookqa",
"name": "Openbookqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/piqa.py",
"module": "src/lighteval/tasks/tasks/piqa.py",
"abstract": "PIQA is a benchmark for testing physical commonsense reasoning. It contains\nquestions requiring this kind of physical commonsense reasoning.",
"languages": [
"english"
],
"tags": [
"commonsense",
"multiple-choice",
"qa"
],
"paper": "https://arxiv.org/abs/1911.11641",
"dataset": "ybisk/piqa",
"name": "Piqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/prost.py",
"module": "src/lighteval/tasks/tasks/prost.py",
"abstract": "PROST is a benchmark for testing physical reasoning about objects through space\nand time. It includes 18,736 multiple-choice questions covering 10 core physics\nconcepts, designed to probe models in zero-shot settings. Results show that even\nlarge pretrained models struggle with physical reasoning and are sensitive to\nquestion phrasing, underscoring their limited real-world understanding.",
"languages": [
"english"
],
"tags": [
"reasoning",
"qa",
"physical-commonsense"
],
"paper": "https://arxiv.org/abs/2106.03634",
"dataset": "lighteval/prost",
"name": "Prost"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/pubmedqa.py",
"module": "src/lighteval/tasks/tasks/pubmedqa.py",
"abstract": "PubMedQA is a dataset for biomedical research question answering.",
"languages": [
"english"
],
"tags": [
"biomedical",
"health",
"medical",
"qa"
],
"paper": "https://pubmedqa.github.io/",
"dataset": "pubmed_qa",
"name": "Pubmedqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/qa4mre.py",
"module": "src/lighteval/tasks/tasks/qa4mre.py",
"abstract": "QA4MRE is a machine reading comprehension benchmark from the CLEF 2011-2013\nchallenges. It evaluates systems' ability to answer questions requiring deep\nunderstanding of short texts, supported by external background knowledge.\nCovering tasks like modality, negation, biomedical reading, and entrance exams,\nQA4MRE tests reasoning beyond surface-level text matching.",
"languages": [
"english"
],
"tags": [
"biomedical",
"health",
"qa"
],
"paper": "https://link.springer.com/chapter/10.1007/978-3-642-40802-1_29",
"dataset": "qa4mre",
"name": "Qa4Mre"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/qasper.py",
"module": "src/lighteval/tasks/tasks/qasper.py",
"abstract": "QASPER is a dataset for question answering on scientific research papers. It\nconsists of 5,049 questions over 1,585 Natural Language Processing papers. Each\nquestion is written by an NLP practitioner who read only the title and abstract\nof the corresponding paper, and the question seeks information present in the\nfull text. The questions are then answered by a separate set of NLP\npractitioners who also provide supporting evidence to answers.",
"languages": [
"english"
],
"tags": [
"qa",
"scientific"
],
"paper": "https://arxiv.org/abs/2105.03011",
"dataset": "allenai/qasper",
"name": "Qasper"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/quac.py",
"module": "src/lighteval/tasks/tasks/quac.py",
"abstract": "The QuAC benchmark for question answering in the context of dialogues.",
"languages": [
"english"
],
"tags": [
"dialog",
"qa"
],
"paper": "https://aclanthology.org/D18-1241/",
"dataset": "lighteval/quac_helm",
"name": "Quac"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/race_high.py",
"module": "src/lighteval/tasks/tasks/race_high.py",
"abstract": "RACE is a large-scale reading comprehension dataset with more than 28,000\npassages and nearly 100,000 questions. The dataset is collected from English\nexaminations in China, which are designed for middle school and high school\nstudents. The dataset can be served as the training and test sets for machine\ncomprehension.",
"languages": [
"english"
],
"tags": [
"multiple-choice",
"reading-comprehension"
],
"paper": "https://aclanthology.org/D17-1082/",
"dataset": "EleutherAI/race",
"name": "Race High"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/raft.py",
"module": "src/lighteval/tasks/tasks/raft.py",
"abstract": "The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text\nclassification tasks.",
"languages": [
"english"
],
"tags": [
"classification",
"reasoning"
],
"paper": "https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html",
"dataset": "ought/raft",
"name": "Raft"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/real_toxicity_prompts.py",
"module": "src/lighteval/tasks/tasks/real_toxicity_prompts.py",
"abstract": "The RealToxicityPrompts dataset for measuring toxicity in prompted model generations",
"languages": [
"english"
],
"tags": [
"generation",
"safety"
],
"paper": "https://aclanthology.org/2020.findings-emnlp.301/",
"dataset": "allenai/real-toxicity-prompts",
"name": "Real Toxicity Prompts"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/sacrebleu.py",
"module": "src/lighteval/tasks/tasks/sacrebleu.py",
"abstract": "tasks from sacrebleu",
"languages": [
"english",
"german",
"french",
"japanese",
"korean",
"chinese",
"arabic"
],
"tags": [
"translation"
],
"paper": "https://github.com/mjpost/sacrebleu",
"dataset": "lighteval/sacrebleu_manual, wmt14, wmt16",
"name": "Sacrebleu"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/sciq.py",
"module": "src/lighteval/tasks/tasks/sciq.py",
"abstract": "The SciQ dataset contains 13,679 crowdsourced science exam questions about\nPhysics, Chemistry and Biology, among others. The questions are in\nmultiple-choice format with 4 answer options each. For the majority of the\nquestions, an additional paragraph with supporting evidence for the correct\nanswer is provided.",
"languages": [
"english"
],
"tags": [
"physics",
"chemistry",
"biology",
"reasoning",
"multiple-choice",
"qa"
],
"paper": "https://arxiv.org/abs/1707.06209",
"dataset": "allenai/sciq",
"name": "Sciq"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/simpleqa.py",
"module": "src/lighteval/tasks/tasks/simpleqa.py",
"abstract": "A factuality benchmark called SimpleQA that measures the ability for language\nmodels to answer short, fact-seeking questions.",
"languages": [
"english"
],
"tags": [
"factuality",
"general-knowledge",
"qa"
],
"paper": "https://openai.com/index/introducing-simpleqa/",
"dataset": "lighteval/SimpleQA",
"name": "Simpleqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/siqa.py",
"module": "src/lighteval/tasks/tasks/siqa.py",
"abstract": "We introduce Social IQa: Social Interaction QA, a new question-answering\nbenchmark for testing social commonsense intelligence. Contrary to many prior\nbenchmarks that focus on physical or taxonomic knowledge, Social IQa focuses on\nreasoning about people's actions and their social implications. For example,\ngiven an action like \"Jesse saw a concert\" and a question like \"Why did Jesse do\nthis?\", humans can easily infer that Jesse wanted \"to see their favorite\nperformer\" or \"to enjoy the music\", and not \"to see what's happening inside\" or\n\"to see if it works\". The actions in Social IQa span a wide variety of social\nsituations, and answer candidates contain both human-curated answers and\nadversarially-filtered machine-generated candidates. Social IQa contains over\n37,000 QA pairs for evaluating models' abilities to reason about the social\nimplications of everyday events and situations.",
"languages": [
"english"
],
"tags": [
"commonsense",
"multiple-choice",
"qa"
],
"paper": null,
"dataset": "allenai/social_i_qa",
"name": "Siqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/slr_bench.py",
"module": "src/lighteval/tasks/tasks/slr_bench.py",
"abstract": "SLR-Bench is a large-scale benchmark for scalable logical reasoning with\nlanguage models, comprising 19,000 prompts organized into 20 curriculum levels.",
"languages": [
"english"
],
"tags": [
"reasoning",
"symbolic"
],
"paper": "https://arxiv.org/abs/2506.15787",
"dataset": "AIML-TUDA/SLR-Bench",
"name": "SLR-Bench"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/squad_v2.py",
"module": "src/lighteval/tasks/tasks/squad_v2.py",
"abstract": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,\nconsisting of questions posed by crowdworkers on a set of Wikipedia articles,\nwhere the answer to every question is a segment of text, or span, from the\ncorresponding reading passage, or the question might be unanswerable.\nSQuAD 2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000\nunanswerable questions written adversarially by crowdworkers to look similar to\nanswerable ones. To do well on SQuAD2.0, systems must not only answer questions\nwhen possible, but also determine when no answer is supported by the paragraph\nand abstain from answering.",
"languages": [
"english"
],
"tags": [
"qa"
],
"paper": "https://arxiv.org/abs/1806.03822",
"dataset": "rajpurkar/squad_v2",
"name": "Squad V2"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/storycloze.py",
"module": "src/lighteval/tasks/tasks/storycloze.py",
"abstract": "A Corpus and Cloze Evaluation for Deeper Understanding of\nCommonsense Stories",
"languages": [
"english"
],
"tags": [
"narrative",
"reasoning"
],
"paper": "https://arxiv.org/abs/1604.01696",
"dataset": "MoE-UNC/story_cloze",
"name": "Storycloze"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/summarization.py",
"module": "src/lighteval/tasks/tasks/summarization.py",
"abstract": "Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural\nNetworks for Extreme Summarization and: Abstractive Text Summarization using\nSequence-to-sequence RNNs and Beyond",
"languages": [
"english"
],
"tags": [
"summarization"
],
"paper": "https://aclanthology.org/D18-1206/\nhttps://aclanthology.org/K16-1028/",
"dataset": "lighteval/summarization",
"name": "Summarization"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/swag.py",
"module": "src/lighteval/tasks/tasks/swag.py",
"abstract": "The dataset consists of 113k multiple choice questions about grounded situations\n(73k training, 20k validation, 20k test). Each question is a video caption from\nLSMDC or ActivityNet Captions, with four answer choices about what might happen\nnext in the scene. The correct answer is the (real) video caption for the next\nevent in the video; the three incorrect answers are adversarially generated and\nhuman verified, so as to fool machines but not humans. SWAG aims to be a\nbenchmark for evaluating grounded commonsense NLI and for learning\nrepresentations.",
"languages": [
"english"
],
"tags": [
"narrative",
"reasoning"
],
"paper": "https://arxiv.org/abs/1808.05326",
"dataset": "allenai/swag",
"name": "Swag"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/synthetic_reasoning.py",
"module": "src/lighteval/tasks/tasks/synthetic_reasoning.py",
"abstract": "LIME: Learning Inductive Bias for Primitives of Mathematical Reasoning",
"languages": [
"english"
],
"tags": [
"reasoning"
],
"paper": "https://arxiv.org/abs/2206.03855",
"dataset": "lighteval/synthetic_reasoning, lighteval/synthetic_reasoning_natural",
"name": "Synthetic Reasoning"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/the_pile.py",
"module": "src/lighteval/tasks/tasks/the_pile.py",
"abstract": "The Pile corpus for measuring lanugage model performance across various domains.",
"languages": [
"english"
],
"tags": [
"language-modeling"
],
"paper": "https://arxiv.org/abs/2101.00027",
"dataset": "lighteval/pile_helm",
"name": "The Pile"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/tiny_benchmarks/main.py",
"module": "src/lighteval/tasks/tasks/tiny_benchmarks/main.py",
"abstract": "TinyBenchmarks is a benchmark for evaluating the performance of language models\non tiny benchmarks.",
"languages": [
"english"
],
"tags": [
"general-knowledge",
"reasoning",
"qa"
],
"paper": "https://arxiv.org/abs/2402.14992",
"dataset": "tinyBenchmarks/tinyWinogrande, tinyBenchmarks/tinyAI2_arc,\ntinyBenchmarks/tinyHellaswag, tinyBenchmarks/tinyMMLU,\ntinyBenchmarks/tinyTruthfulQA, tinyBenchmarks/tinyGSM8k",
"name": "Tiny Benchmarks"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/toxigen.py",
"module": "src/lighteval/tasks/tasks/toxigen.py",
"abstract": "This dataset is for implicit hate speech detection. All instances were generated\nusing GPT-3 and the methods described in our paper.",
"languages": [
"english"
],
"tags": [
"generation",
"safety"
],
"paper": "https://arxiv.org/abs/2203.09509",
"dataset": "skg/toxigen-data",
"name": "Toxigen"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/triviaqa.py",
"module": "src/lighteval/tasks/tasks/triviaqa.py",
"abstract": "TriviaqQA is a reading comprehension dataset containing over 650K\nquestion-answer-evidence triples. TriviaqQA includes 95K question-answer pairs\nauthored by trivia enthusiasts and independently gathered evidence documents,\nsix per question on average, that provide high quality distant supervision for\nanswering the questions.",
"languages": [
"english"
],
"tags": [
"qa"
],
"paper": "https://arxiv.org/abs/1705.03551",
"dataset": "mandarjoshi/trivia_qa",
"name": "Triviaqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/truthfulqa.py",
"module": "src/lighteval/tasks/tasks/truthfulqa.py",
"abstract": "TruthfulQA: Measuring How Models Mimic Human Falsehoods",
"languages": [
"english"
],
"tags": [
"factuality",
"qa"
],
"paper": "https://arxiv.org/abs/2109.07958",
"dataset": "EleutherAI/truthful_qa_mc",
"name": "Truthfulqa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/twitterAAE.py",
"module": "src/lighteval/tasks/tasks/twitterAAE.py",
"abstract": "Demographic Dialectal Variation in Social Media: A Case Study of African-American English",
"languages": [
"english"
],
"tags": [
"language-modeling"
],
"paper": "https://aclanthology.org/D16-1120/",
"dataset": "lighteval/twitterAAE",
"name": "Twitteraae"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/unscramble.py",
"module": "src/lighteval/tasks/tasks/unscramble.py",
"abstract": "Benchmark where we ask the model to unscramble a word, either anagram or\nrandom insertion.",
"languages": [
"english"
],
"tags": [
"language-modeling",
"reasoning"
],
"paper": "https://huggingface.co/datasets/lighteval/GPT3_unscramble",
"dataset": "lighteval/GPT3_unscramble",
"name": "Unscramble"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/webqs.py",
"module": "src/lighteval/tasks/tasks/webqs.py",
"abstract": "This dataset consists of 6,642 question/answer pairs. The questions are supposed\nto be answerable by Freebase, a large knowledge graph. The questions are mostly\ncentered around a single named entity. The questions are popular ones asked on\nthe web.",
"languages": [
"english"
],
"tags": [
"qa"
],
"paper": "https://aclanthology.org/D13-1160.pdf",
"dataset": "stanfordnlp/web_questions",
"name": "Webqs"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/wikifact.py",
"module": "src/lighteval/tasks/tasks/wikifact.py",
"abstract": "Extensively test factual knowledge.",
"languages": [
"english"
],
"tags": [
"factuality",
"knowledge"
],
"paper": "https://aclanthology.org/D19-1250/",
"dataset": "lighteval/wikifact",
"name": "Wikifact"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/wikitext.py",
"module": "src/lighteval/tasks/tasks/wikitext.py",
"abstract": "The WikiText language modeling dataset is a collection of over 100 million\ntokens extracted from the set of verified Good and Featured articles on\nWikipedia. The dataset is available under the Creative Commons\nAttribution-ShareAlike License.",
"languages": [
"english"
],
"tags": [
"language-modeling"
],
"paper": "https://arxiv.org/abs/1609.07843",
"dataset": "EleutherAI/wikitext_document_level",
"name": "Wikitext"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/winogrande.py",
"module": "src/lighteval/tasks/tasks/winogrande.py",
"abstract": "WinoGrande is a new collection of 44k problems, inspired by Winograd Schema\nChallenge (Levesque, Davis, and Morgenstern 2011), but adjusted to improve the\nscale and robustness against the dataset-specific bias. Formulated as a\nfill-in-a-blank task with binary options, the goal is to choose the right option\nfor a given sentence which requires commonsense reasoning.",
"languages": [
"english"
],
"tags": [
"commonsense",
"multiple-choice"
],
"paper": "https://arxiv.org/abs/1907.10641",
"dataset": "allenai/winogrande",
"name": "Winogrande"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/xcopa.py",
"module": "src/lighteval/tasks/tasks/xcopa.py",
"abstract": "XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning The Cross-lingual\nChoice of Plausible Alternatives dataset is a benchmark to evaluate the ability\nof machine learning models to transfer commonsense reasoning across languages.",
"languages": [
"english"
],
"tags": [
"commonsense",
"multilingual",
"multiple-choice",
"reasoning"
],
"paper": "https://arxiv.org/abs/2005.00333",
"dataset": "cambridgeltl/xcopa",
"name": "Xcopa"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/xstory_cloze.py",
"module": "src/lighteval/tasks/tasks/xstory_cloze.py",
"abstract": "XStoryCloze consists of the professionally translated version of the English\nStoryCloze dataset (Spring 2016 version) to 10 non-English languages. This\ndataset is released by Meta AI.",
"languages": [
"english",
"russian",
"chinese",
"spanish",
"arabic",
"hindi",
"indonesian",
"telugu",
"swahili",
"basque",
"burmese"
],
"tags": [
"multilingual",
"narrative",
"reasoning"
],
"paper": null,
"dataset": "juletxara/xstory_cloze",
"name": "Xstory Cloze"
},
{
"file_path": "/Users/nathan/Repos/lighteval-dev-1/src/lighteval/tasks/tasks/xwinograd.py",
"module": "src/lighteval/tasks/tasks/xwinograd.py",
"abstract": "Multilingual winograd schema challenge as used in Crosslingual Generalization through Multitask Finetuning.",
"languages": [
"english",
"french",
"japanese",
"portuguese",
"russian",
"chinese"
],
"tags": [
"commonsense",
"multilingual",
"reasoning"
],
"paper": "https://arxiv.org/abs/2211.01786",
"dataset": "Muennighoff/xwinograd",
"name": "Xwinograd"
}
],
"languages": [
"english",
"chinese",
"arabic",
"french",
"russian",
"spanish",
"german",
"hindi",
"swahili",
"portuguese",
"italian",
"telugu",
"thai",
"bengali",
"japanese",
"vietnamese",
"tamil",
"turkish",
"indonesian",
"kannada",
"malayalam",
"dutch",
"gujarati",
"marathi",
"serbian",
"hungarian",
"romanian",
"croatian",
"danish",
"greek",
"korean",
"polish",
"urdu",
"nepali",
"slovak",
"swedish",
"ukrainian",
"yoruba",
"amharic",
"armenian",
"assamese",
"basque",
"bulgarian",
"catalan",
"hebrew",
"norwegian",
"punjabi",
"zulu",
"ewe",
"finnish",
"hausa",
"igbo",
"khmer",
"kinyarwanda",
"lingala",
"luganda",
"oriya",
"oromo",
"shona",
"sotho",
"twi",
"wolof",
"xhosa",
"burmese",
"chinese (simplified)",
"chinese (traditional)",
"cyrillic",
"czech",
"devanagari",
"estonian",
"ethiopic",
"georgian",
"gurmukhi",
"hangul",
"icelandic",
"lao",
"latin",
"lithuanian",
"malay",
"myanmar",
"odia",
"sanskrit",
"sinhala",
"tibetan",
"afrikaans",
"albanian",
"chinese_hong_kong",
"chinese_traditional",
"filipino",
"ga",
"haitian",
"javanese",
"latvian",
"macedonian",
"mt",
"quechua",
"sindhi",
"slovenian",
"turkic"
],
"tags": [
"bias",
"biology",
"biomedical",
"chemistry",
"classification",
"code-generation",
"commonsense",
"conversational",
"dialog",
"emotion",
"ethics",
"factuality",
"general-knowledge",
"generation",
"geography",
"graduate-level",
"health",
"history",
"instruction-following",
"justice",
"knowledge",
"language",
"language-modeling",
"language-understanding",
"legal",
"long-context",
"math",
"medical",
"morality",
"multi-turn",
"multilingual",
"multimodal",
"multiple-choice",
"narrative",
"nli",
"physical-commonsense",
"physics",
"qa",
"reading-comprehension",
"reasoning",
"safety",
"science",
"scientific",
"summarization",
"symbolic",
"translation",
"utilitarianism",
"virtue"
]
}