from tests.utils import wrap_test_forked


def test_bleurt():
    predictions = ["hello there", "general kenobi"]
    references = ["hello there", "general kenobi"]
    import evaluate
    bleurt = evaluate.load("bleurt")
    results = bleurt.compute(predictions=predictions, references=references)
    # BLEURT scores are not bounded to [0, 1], so identical strings can score slightly above 1.0.
    assert [round(v, 2) for v in results["scores"]] == [1.03, 1.04]


def test_sacrebleu():
    predictions = ["hello there general kenobi", "foo bar foobar"]
    references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]]
    import evaluate
    sacrebleu = evaluate.load("sacrebleu")
    results = sacrebleu.compute(predictions=predictions, references=references)
    assert list(results.keys()) == ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len']
    assert round(results["score"], 1) == 100.0

    predictions = ["hello there general kenobi", "on our way to ankh morpork"]
    references = [["hello there general kenobi", "hello there !"], ["goodbye ankh morpork", "ankh morpork"]]
    sacrebleu = evaluate.load("sacrebleu")
    results = sacrebleu.compute(predictions=predictions, references=references)
    assert list(results.keys()) == ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len']
    assert round(results["score"], 1) == 39.8


def test_bleu():
    predictions = ["hello there general kenobi", "foo bar foobar"]
    references = [
        ["hello there general kenobi", "hello there!"],
        ["foo bar foobar"]
    ]
    import evaluate
    bleu = evaluate.load("bleu")
    results = bleu.compute(predictions=predictions, references=references)
    assert results["bleu"] == 1.0


def test_squad_v1():
    predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22'}]
    references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
    import evaluate
    squad_metric = evaluate.load("squad")
    results = squad_metric.compute(predictions=predictions, references=references)
    assert results == {'exact_match': 100.0, 'f1': 100.0}


def test_squad_v2():
    predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22', 'no_answer_probability': 0.}]
    references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
    import evaluate
    squad_v2_metric = evaluate.load("squad_v2")
    results = squad_v2_metric.compute(predictions=predictions, references=references)
    assert results == {'exact': 100.0, 'f1': 100.0, 'total': 1, 'HasAns_exact': 100.0, 'HasAns_f1': 100.0,
                       'HasAns_total': 1, 'best_exact': 100.0, 'best_exact_thresh': 0.0, 'best_f1': 100.0,
                       'best_f1_thresh': 0.0}


def test_rouge():
    import evaluate
    rouge = evaluate.load('rouge')
    predictions = ["hello there", "general kenobi"]
    references = ["hello there", "general kenobi"]
    results = rouge.compute(predictions=predictions, references=references)
    assert results == {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}


def test_bertscore():
    predictions = ["hello there", "general kenobi"]
    references = ["hello there", "general kenobi"]
    import evaluate
    bertscore = evaluate.load("bertscore")
    results = bertscore.compute(predictions=predictions, references=references, lang="en")
    assert [round(v, 2) for v in results["f1"]] == [1.0, 1.0]


def test_chrf():
    prediction = ["The relationship between cats and dogs is not exactly friendly.",
                  "a good bookshop is just a genteel black hole that knows how to read."]
    reference = [["The relationship between dogs and cats is not exactly friendly.", ],
                 ["A good bookshop is just a genteel Black Hole that knows how to read."]]
    import evaluate
    chrf = evaluate.load("chrf")
    results = chrf.compute(predictions=prediction, references=reference)
    assert results == {'score': 84.64214891738334, 'char_order': 6, 'word_order': 0, 'beta': 2}


def test_chrfpp():
    prediction = ["The relationship between cats and dogs is not exactly friendly.",
                  "a good bookshop is just a genteel black hole that knows how to read."]
    reference = [["The relationship between dogs and cats is not exactly friendly.", ],
                 ["A good bookshop is just a genteel Black Hole that knows how to read."]]
    import evaluate
    chrf = evaluate.load("chrf")
    # word_order=2 turns chrF into chrF++ (character n-grams plus word 1- and 2-grams).
    results = chrf.compute(predictions=prediction, references=reference, word_order=2)
    assert results == {'beta': 2, 'char_order': 6, 'score': 82.87263732906315, 'word_order': 2}


def test_wiki_split():
    sources = ["About 95 species are currently accepted ."]
    predictions = ["About 95 you now get in ."]
    references = [["About 95 species are currently known ."]]
    import evaluate
    wiki_split = evaluate.load("wiki_split")
    results = wiki_split.compute(sources=sources, predictions=predictions, references=references)
    assert results == {'sari': 21.805555555555557, 'sacrebleu': 14.535768424205482, 'exact': 0.0}


def test_super_glue():
    from evaluate import load
    # https://huggingface.co/datasets/boolq
    # passage, question, answer (boolean only, but an LLM can be asked to answer only "true" or
    # "false"; see the sketch after this test)
    super_glue_metric = load('super_glue', 'boolq')  # any of ["copa", "rte", "wic", "wsc", "wsc.fixed", "boolq", "axg"]
    predictions = [0, 1]
    references = [0, 1]
    results = super_glue_metric.compute(predictions=predictions, references=references)
    assert results == {'accuracy': 1.0}
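

# Hedged sketch, not part of the original tests: the comment in test_super_glue suggests asking an
# LLM to answer boolq with only "true" or "false". One plausible way to turn such free-text answers
# into the integer labels the super_glue/boolq metric expects (0 = false, 1 = true is assumed here)
# before calling compute(). The leading underscore keeps pytest from collecting it as a test.
def _boolq_predictions_from_llm_answers(answers):
    # e.g. ["True.", " false"] -> [1, 0]; anything unrecognized is treated as false in this sketch
    return [1 if str(a).strip().lower().startswith("true") else 0 for a in answers]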


def test_quip():
    from metrics.quip import Quip
    quip = Quip()
    predictions = ["Kathy's hair is green according to the first passage."]
    references = [["Kathy's hair is green.", "Bob is eating a sandwich.", "The sky is red with polka dots.",
                   "Alice went to the county fair.", "George is reading a newspaper."]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.16666666666666663

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["Kathy's hair is green.", "Bob is eating a sandwich.", "The sky is red with polka dots.",
                   "Alice went to the county fair.", "George is reading a newspaper."]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.0

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["chuck", "wood"]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.0

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["chuck", "woodchuck"]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.0

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["chuck", "woodchuck"]]
    results = quip.compute(predictions=predictions, references=references, min_len=1)
    print(results)
    assert results == 0.09523809523809523

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["woodchuck chuck", "chuck"]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.05882352941176472

    predictions = ["The current goodwill balance is $25,173 million as of December 31, 2022."]
    references = [[
        "Table 7.3: Goodwill (in millions) Consumer Banking and Lending Commercial Banking Corporate and Investment Banking Wealth and Investment Management Corporate Consolidated Company December 31, 2020 $ 16,418 3,018 5,375 1,276 305 26,392 Foreign currency translation β β β β β β Transfers of goodwill β (80) β (932) 1,012 β Divestitures β β β β (1,212) (1,212) December 31, 2021 $ 16,418 2,938 5,375 344 105 25,180 Foreign currency translation β (7) β β β (7) December 31, 2022 $ 16,418 2,931 5,375 344 105 25,173 Table 7.4 presents the components of other assets."]]
    results = quip.compute(predictions=predictions, references=references, min_len=1)
    print(results)
    assert results == 0.33333333333333337

    predictions = ["The current goodwill balance is $25,173 million as of December 31, 2022."]
    references = [[
        "Table 7.3: Goodwill (in millions) Consumer Banking and Lending Commercial Banking Corporate and Investment Banking Wealth and Investment Management Corporate Consolidated Company December 31, 2020 $ 16,418 3,018 5,375 1,276 305 26,392 Foreign currency translation β β β β β β Transfers of goodwill β (80) β (932) 1,012 β Divestitures β β β β (1,212) (1,212) December 31, 2021 $ 16,418 2,938 5,375 344 105 25,180 Foreign currency translation β (7) β β β (7) December 31, 2022 $ 16,418 2,931 5,375 344 105 25,173 Table 7.4 presents the components of other assets."]]
    results = quip.compute(predictions=predictions, references=references, return_match_count=True)
    print(results)
    assert results == 4

    predictions = ["The current goodwill balance is $25,173 million as of December 31, 2022."]
    references = [[
        "Table 7.3: Goodwill (in millions) Consumer Banking and Lending Commercial Banking Corporate and Investment Banking Wealth and Investment Management Corporate Consolidated Company December 31, 2020 $ 16,418 3,018 5,375 1,276 305 26,392 Foreign currency translation β β β β β β Transfers of goodwill β (80) β (932) 1,012 β Divestitures β β β β (1,212) (1,212) December 31, 2021 $ 16,418 2,938 5,375 344 105 25,180 Foreign currency translation β (7) β β β (7) December 31, 2022 $ 16,418 2,931 5,375 344 105 25,173 Table 7.4 presents the components of other assets."]]
    results = quip.compute(predictions=predictions, references=references, return_match_fraction_by_pred_length=True)
    print(results)
    assert results == 0.5

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [[
        "Table 7.3: Goodwill (in millions) Consumer Banking and Lending Commercial Banking Corporate and Investment Banking Wealth and Investment Management Corporate Consolidated Company December 31, 2020 $ 16,418 3,018 5,375 1,276 305 26,392 Foreign currency translation β β β β β β Transfers of goodwill β (80) β (932) 1,012 β Divestitures β β β β (1,212) (1,212) December 31, 2021 $ 16,418 2,938 5,375 344 105 25,180 Foreign currency translation β (7) β β β (7) December 31, 2022 $ 16,418 2,931 5,375 344 105 25,173 Table 7.4 presents the components of other assets."]]
    results = quip.compute(predictions=predictions, references=references, return_match_fraction_by_pred_length=True)
    print(results)
    assert results == 0.0


def test_glue():
    # entailment; see the label-mapping sketch after this test
    """
    E.g. for qnli:
    The Stanford Question Answering Dataset is a question-answering dataset consisting of question-paragraph pairs,
    where one of the sentences in the paragraph (drawn from Wikipedia) contains the answer to the corresponding
    question (written by an annotator). The authors of the benchmark convert the task into sentence pair
    classification by forming a pair between each question and each sentence in the corresponding context,
    and filtering out pairs with low lexical overlap between the question and the context sentence.
    The task is to determine whether the context sentence contains the answer to the question.
    This modified version of the original task removes the requirement that the model select the exact answer,
    but also removes the simplifying assumptions that the answer is always present in the input
    and that lexical overlap is a reliable cue.
    :return:
    """
    from evaluate import load
    glue_metric = load('glue', 'qnli')
    references = [0, 1]
    predictions = [1, 1]
    results = glue_metric.compute(predictions=predictions, references=references)
    assert results == {'accuracy': 0.5}
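

# Hedged sketch, not part of the original tests: glue/qnli expects integer class ids. Assuming the
# usual GLUE convention of 0 = entailment and 1 = not_entailment, textual model outputs could be
# mapped like this before calling glue_metric.compute().
def _qnli_ids_from_labels(labels):
    # e.g. ["not_entailment", "entailment"] -> [1, 0]
    return [0 if str(lab).strip().lower().startswith("entail") else 1 for lab in labels]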


def test_google_bleu():
    sentence1 = "the cat sat on the mat"
    sentence2 = "the cat ate the mat"
    import evaluate
    google_bleu = evaluate.load("google_bleu")
    result = google_bleu.compute(predictions=[sentence1], references=[[sentence2]])
    assert result == {'google_bleu': 0.3333333333333333}

    predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat',
                   'he read the book because he was interested in world history']
    references = [
        ['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat',
         'It is a guide to action that ensures that the rubber duck will never heed the cat commands',
         'It is the practical guide for the rubber duck army never to heed the directions of the cat'],
        ['he was interested in world history because he read the book']]
    google_bleu = evaluate.load("google_bleu")
    results = google_bleu.compute(predictions=predictions, references=references, min_len=2, max_len=6)
    assert round(results["google_bleu"], 2) == 0.4


def test_meteor():
    import evaluate
    meteor = evaluate.load('meteor')
    predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
    references = [['It is a guide to action that ensures that the military will forever heed Party commands',
                   'It is the guiding principle which guarantees the military forces always being under the command of the Party',
                   'It is the practical guide for the army always to heed the directions of the party']]
    results = meteor.compute(predictions=predictions, references=references)
    assert round(results['meteor'], 2) == 0.69

    predictions = ["Kathy's hair is green according to the first passage."]
    references = [["Kathy's hair is green.", "Bob is eating a sandwich.", "The sky is red with polka dots.",
                   "Alice went to the county fair.", "George is reading a newspaper."]]
    results = meteor.compute(predictions=predictions, references=references)
    print(results)
    assert results == {'meteor': 0.9059829059829061}