Spaces: Intel/adversarial_glue

Running

App Files Community

tybrs committed on Dec 15, 2023

Commit

b9e00cb

1 Parent(s): 4b8cc84

Update Space (evaluate main: 1a12c674)

Browse files

Files changed (2) hide show

README.md +3 -4
adversarial_glue.py +30 -60

README.md CHANGED Viewed

@@ -38,7 +38,7 @@ mc_results,  = suite.run("gpt2")
 The output of the metric depends on the GLUE subset chosen, consisting of a dictionary that contains one or several of the following metrics:
-`accuracy`: the proportion of correct predictions among the total number of cases processed, with a range between 0 and 1 (see [accuracy](https://huggingface.co/metrics/accuracy) for more information).
 ### Values from popular papers
@@ -47,14 +47,14 @@ The [original GLUE paper](https://huggingface.co/datasets/glue) reported average
 For more recent model performance, see the [dataset leaderboard](https://paperswithcode.com/dataset/glue).
-## Examples
 For full example see [HF Evaluate Adversarial Attacks.ipynb](https://github.com/IntelAI/evaluate/blob/develop/notebooks/HF%20Evaluate%20Adversarial%20Attacks.ipynb)
 ## Limitations and bias
 This metric works only with datasets that have the same format as the [GLUE dataset](https://huggingface.co/datasets/glue).
-While the GLUE dataset is meant to represent "General Language Understanding", the tasks represented in it are not necessarily representative of language understanding, and should not be interpreted as such.
 ## Citation
@@ -66,4 +66,3 @@ While the GLUE dataset is meant to represent "General Language Understanding", t
   year={2021}
 }
 ```

 The output of the metric depends on the GLUE subset chosen, consisting of a dictionary that contains one or several of the following metrics:
+`accuracy`: the proportion of correct predictions among the total number of cases processed, with a range between 0 and 1 (see [accuracy](https://huggingface.co/metrics/accuracy) for more information).
 ### Values from popular papers
 For more recent model performance, see the [dataset leaderboard](https://paperswithcode.com/dataset/glue).
+## Examples
 For full example see [HF Evaluate Adversarial Attacks.ipynb](https://github.com/IntelAI/evaluate/blob/develop/notebooks/HF%20Evaluate%20Adversarial%20Attacks.ipynb)
 ## Limitations and bias
 This metric works only with datasets that have the same format as the [GLUE dataset](https://huggingface.co/datasets/glue).
+While the GLUE dataset is meant to represent "General Language Understanding", the tasks represented in it are not necessarily representative of language understanding, and should not be interpreted as such.
 ## Citation
   year={2021}
 }
 ```

adversarial_glue.py CHANGED Viewed

@@ -1,7 +1,8 @@
 from evaluate.evaluation_suite import SubTask
 from evaluate.visualization import radar_plot
-from intel_evaluate_extension.evaluation_suite.model_card_suite import ModelCardSuiteResults
 _HEADER = "GLUE/AdvGlue Evaluation Results"
@@ -27,11 +28,8 @@ class Suite(ModelCardSuiteResults):
                     "input_column": "sentence",
                     "label_column": "label",
                     "config_name": "sst2",
-                    "label_mapping": {
-                        "LABEL_0": 0.0,
-                        "LABEL_1": 1.0
-                    }
-                }
             ),
             SubTask(
                 task_type="text-classification",
@@ -43,29 +41,22 @@ class Suite(ModelCardSuiteResults):
                     "input_column": "sentence",
                     "label_column": "label",
                     "config_name": "sst2",
-                    "label_mapping": {
-                        "LABEL_0": 0.0,
-                        "LABEL_1": 1.0
-                    }
-                }
             ),
             SubTask(
                 task_type="text-classification",
                 data="glue",
                 subset="qqp",
                 split="validation[:5]",
                 args_for_task={
                     "metric": "glue",
                     "input_column": "question1",
                     "second_input_column": "question2",
                     "label_column": "label",
                     "config_name": "qqp",
-                    "label_mapping": {
-                        "LABEL_0": 0,
-                        "LABEL_1": 1
-                    }
-                }
             ),
             SubTask(
                 task_type="text-classification",
@@ -78,11 +69,8 @@ class Suite(ModelCardSuiteResults):
                     "second_input_column": "question2",
                     "label_column": "label",
                     "config_name": "qqp",
-                    "label_mapping": {
-                        "LABEL_0": 0,
-                        "LABEL_1": 1
-                    }
-                }
             ),
             SubTask(
                 task_type="text-classification",
@@ -95,11 +83,8 @@ class Suite(ModelCardSuiteResults):
                     "second_input_column": "sentence",
                     "label_column": "label",
                     "config_name": "qnli",
-                    "label_mapping": {
-                        "LABEL_0": 0,
-                        "LABEL_1": 1
-                    }
-                }
             ),
             SubTask(
                 task_type="text-classification",
@@ -112,11 +97,8 @@ class Suite(ModelCardSuiteResults):
                     "second_input_column": "sentence",
                     "label_column": "label",
                     "config_name": "qnli",
-                    "label_mapping": {
-                        "LABEL_0": 0,
-                        "LABEL_1": 1
-                    }
-                }
             ),
             SubTask(
                 task_type="text-classification",
@@ -129,11 +111,8 @@ class Suite(ModelCardSuiteResults):
                     "second_input_column": "sentence2",
                     "label_column": "label",
                     "config_name": "rte",
-                    "label_mapping": {
-                        "LABEL_0": 0,
-                        "LABEL_1": 1
-                    }
-                }
             ),
             SubTask(
                 task_type="text-classification",
@@ -146,11 +125,8 @@ class Suite(ModelCardSuiteResults):
                     "second_input_column": "sentence2",
                     "label_column": "label",
                     "config_name": "rte",
-                    "label_mapping": {
-                        "LABEL_0": 0,
-                        "LABEL_1": 1
-                    }
-                }
             ),
             SubTask(
                 task_type="text-classification",
@@ -162,12 +138,8 @@ class Suite(ModelCardSuiteResults):
                     "input_column": "premise",
                     "second_input_column": "hypothesis",
                     "config_name": "mnli",
-                    "label_mapping": {
-                        "LABEL_0": 0,
-                        "LABEL_1": 1,
-                        "LABEL_2": 2
-                    }
-                }
             ),
             SubTask(
                 task_type="text-classification",
@@ -179,24 +151,22 @@ class Suite(ModelCardSuiteResults):
                     "input_column": "premise",
                     "second_input_column": "hypothesis",
                     "config_name": "mnli",
-                    "label_mapping": {
-                        "LABEL_0": 0,
-                        "LABEL_1": 1,
-                        "LABEL_2": 2
-                    }
-                }
             ),
         ]
     def process_results(self, results):
         radar_data = [
-            {"accuracy " + result["task_name"].split("/")[-1]:
-             result["accuracy"] for result in results[::2]},
-            {"accuracy " + result["task_name"].replace("adv_", "").split("/")[-1]:
-             result["accuracy"] for result in results[1::2]}]
-        return radar_plot(radar_data, ['GLUE', 'AdvGLUE'])
     def plot_results(self, results, model_or_pipeline):
         radar_data = self.process_results(results)
-        graphic = radar_plot(radar_data, ['GLUE ' + model_or_pipeline,  'AdvGLUE ' + model_or_pipeline])
         return graphic

+from intel_evaluate_extension.evaluation_suite.model_card_suite import ModelCardSuiteResults
 from evaluate.evaluation_suite import SubTask
 from evaluate.visualization import radar_plot
 _HEADER = "GLUE/AdvGlue Evaluation Results"
                     "input_column": "sentence",
                     "label_column": "label",
                     "config_name": "sst2",
+                    "label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0},
+                },
             ),
             SubTask(
                 task_type="text-classification",
                     "input_column": "sentence",
                     "label_column": "label",
                     "config_name": "sst2",
+                    "label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0},
+                },
             ),
             SubTask(
                 task_type="text-classification",
                 data="glue",
                 subset="qqp",
                 split="validation[:5]",
                 args_for_task={
                     "metric": "glue",
                     "input_column": "question1",
                     "second_input_column": "question2",
                     "label_column": "label",
                     "config_name": "qqp",
+                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
+                },
             ),
             SubTask(
                 task_type="text-classification",
                     "second_input_column": "question2",
                     "label_column": "label",
                     "config_name": "qqp",
+                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
+                },
             ),
             SubTask(
                 task_type="text-classification",
                     "second_input_column": "sentence",
                     "label_column": "label",
                     "config_name": "qnli",
+                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
+                },
             ),
             SubTask(
                 task_type="text-classification",
                     "second_input_column": "sentence",
                     "label_column": "label",
                     "config_name": "qnli",
+                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
+                },
             ),
             SubTask(
                 task_type="text-classification",
                     "second_input_column": "sentence2",
                     "label_column": "label",
                     "config_name": "rte",
+                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
+                },
             ),
             SubTask(
                 task_type="text-classification",
                     "second_input_column": "sentence2",
                     "label_column": "label",
                     "config_name": "rte",
+                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1},
+                },
             ),
             SubTask(
                 task_type="text-classification",
                     "input_column": "premise",
                     "second_input_column": "hypothesis",
                     "config_name": "mnli",
+                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
+                },
             ),
             SubTask(
                 task_type="text-classification",
                     "input_column": "premise",
                     "second_input_column": "hypothesis",
                     "config_name": "mnli",
+                    "label_mapping": {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
+                },
             ),
         ]
     def process_results(self, results):
         radar_data = [
+            {"accuracy " + result["task_name"].split("/")[-1]: result["accuracy"] for result in results[::2]},
+            {
+                "accuracy " + result["task_name"].replace("adv_", "").split("/")[-1]: result["accuracy"]
+                for result in results[1::2]
+            },
+        ]
+        return radar_plot(radar_data, ["GLUE", "AdvGLUE"])
     def plot_results(self, results, model_or_pipeline):
         radar_data = self.process_results(results)
+        graphic = radar_plot(radar_data, ["GLUE " + model_or_pipeline, "AdvGLUE " + model_or_pipeline])
         return graphic