add zero shot classification task (#45)
* add zero shot classification task
* fix default metric list for zero shot classification
* Update enum
* Rename to text_zero_shot_classification
* Resolve merge conflict
* Incorporate Lewis' refactor
* Adhere to Hub naming conventions
* Incorporate AutoTrain changes for deprecated data endpoint
* SageMaker update
* SageMaker changes
Co-authored-by: mathemakitten <helen.ngo14@gmail.com>
Co-authored-by: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Co-authored-by: helen <31600291+mathemakitten@users.noreply.github.com>
app.py CHANGED
@@ -43,6 +43,7 @@ TASK_TO_ID = {
     "extractive_question_answering": 5,
     "translation": 6,
     "summarization": 8,
+    "text_zero_shot_classification": 23,
 }
 
 TASK_TO_DEFAULT_METRICS = {
@@ -65,6 +66,7 @@ TASK_TO_DEFAULT_METRICS = {
         "recall",
         "accuracy",
     ],
+    "text_zero_shot_classification": ["accuracy", "loss"],
 }
 
 AUTOTRAIN_TASK_TO_LANG = {
@@ -73,6 +75,8 @@ AUTOTRAIN_TASK_TO_LANG = {
     "image_multi_class_classification": "unk",
 }
 
+AUTOTRAIN_MACHINE = {"text_zero_shot_classification": "r5.16x"}
+
 
 SUPPORTED_TASKS = list(TASK_TO_ID.keys())
 
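Taken together, these three additions register the new task end to end: an AutoTrAIN task ID, a default metric list, and a dedicated machine type. A minimal sketch of the lookups, trimmed to just the entries this diff adds (the full app defines many more tasks):

# Trimmed to the entries added in this diff
TASK_TO_ID = {"text_zero_shot_classification": 23}
TASK_TO_DEFAULT_METRICS = {"text_zero_shot_classification": ["accuracy", "loss"]}
AUTOTRAIN_MACHINE = {"text_zero_shot_classification": "r5.16x"}

task = "text_zero_shot_classification"
print(TASK_TO_ID[task])               # 23
print(TASK_TO_DEFAULT_METRICS[task])  # ['accuracy', 'loss']
print(AUTOTRAIN_MACHINE[task])        # r5.16x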
@@ -273,6 +277,45 @@ with st.expander("Advanced configuration"):
         col_mapping[text_col] = "text"
         col_mapping[target_col] = "target"
 
+    elif selected_task == "text_zero_shot_classification":
+        with col1:
+            st.markdown("`text` column")
+            st.text("")
+            st.text("")
+            st.text("")
+            st.text("")
+            st.markdown("`classes` column")
+            st.text("")
+            st.text("")
+            st.text("")
+            st.text("")
+            st.markdown("`target` column")
+        with col2:
+            text_col = st.selectbox(
+                "This column should contain the text to be classified",
+                col_names,
+                index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
+                if config_metadata is not None
+                else 0,
+            )
+            classes_col = st.selectbox(
+                "This column should contain the classes associated with the text",
+                col_names,
+                index=col_names.index(get_key(config_metadata["col_mapping"], "classes"))
+                if config_metadata is not None
+                else 0,
+            )
+            target_col = st.selectbox(
+                "This column should contain the index of the correct class",
+                col_names,
+                index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
+                if config_metadata is not None
+                else 0,
+            )
+        col_mapping[text_col] = "text"
+        col_mapping[classes_col] = "classes"
+        col_mapping[target_col] = "target"
+
     if selected_task in ["natural_language_inference"]:
         config_metadata = get_config_metadata(selected_config, metadata)
         with col1:
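The net effect of the new branch is a three-entry column mapping from dataset columns to the task schema. A minimal sketch with hypothetical column names (`review`, `label_options`, and `label_idx` are invented for illustration):

# Hypothetical selectbox results; in the app these come from st.selectbox above
text_col, classes_col, target_col = "review", "label_options", "label_idx"
col_mapping = {}
col_mapping[text_col] = "text"
col_mapping[classes_col] = "classes"
col_mapping[target_col] = "target"
print(col_mapping)
# {'review': 'text', 'label_options': 'classes', 'label_idx': 'target'}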
@@ -533,8 +576,10 @@ with st.form(key="form"):
                     else "en",
                     "max_models": 5,
                     "instance": {
-                        "provider": "
-                        "instance_type":
+                        "provider": "sagemaker",
+                        "instance_type": AUTOTRAIN_MACHINE[selected_task]
+                        if selected_task in AUTOTRAIN_MACHINE.keys()
+                        else "p3",
                         "max_runtime_seconds": 172800,
                         "num_instances": 1,
                         "disk_size_gb": 150,
@@ -560,17 +605,15 @@ with st.form(key="form"):
                 "split": 4,  # use "auto" split choice in AutoTrain
                 "col_mapping": col_mapping,
                 "load_config": {"max_size_bytes": 0, "shuffle": False},
+                "dataset_id": selected_dataset,
+                "dataset_config": selected_config,
+                "dataset_split": selected_split,
             }
             data_json_resp = http_post(
-                path=f"/projects/{project_json_resp['id']}/data/
+                path=f"/projects/{project_json_resp['id']}/data/dataset",
                 payload=data_payload,
                 token=HF_TOKEN,
                 domain=AUTOTRAIN_BACKEND_API,
-                params={
-                    "type": "dataset",
-                    "config_name": selected_config,
-                    "split_name": selected_split,
-                },
             ).json()
             print(f"INFO -- Dataset creation response: {data_json_resp}")
             if data_json_resp["download_status"] == 1:
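With the deprecated data endpoint removed, the dataset coordinates now travel in the request body rather than in query parameters, and the call targets the `/data/dataset` path. A sketch of the resulting payload with hypothetical dataset values:

data_payload = {
    "split": 4,  # "auto" split choice in AutoTrain
    "col_mapping": {"review": "text", "label_options": "classes", "label_idx": "target"},
    "load_config": {"max_size_bytes": 0, "shuffle": False},
    "dataset_id": "imdb",            # hypothetical
    "dataset_config": "plain_text",  # hypothetical
    "dataset_split": "test",         # hypothetical
}
# POSTed to /projects/{project_id}/data/dataset; the old params={"type": "dataset", ...} are gone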
utils.py CHANGED
@@ -19,6 +19,7 @@ AUTOTRAIN_TASK_TO_HUB_TASK = {
     "summarization": "summarization",
     "image_binary_classification": "image-classification",
     "image_multi_class_classification": "image-classification",
+    "text_zero_shot_classification": "text-generation",
 }
 
 
@@ -82,7 +83,8 @@ def get_compatible_models(task: str, dataset_ids: List[str]) -> List[str]:
     """
     compatible_models = []
     # Allow any summarization model to be used for summarization tasks
-    if task == "summarization":
+    # and allow any text-generation model to be used for text_zero_shot_classification
+    if task in ("summarization", "text_zero_shot_classification"):
         model_filter = ModelFilter(
             task=AUTOTRAIN_TASK_TO_HUB_TASK[task],
             library=["transformers", "pytorch"],
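For context, a sketch of what the widened filter matches, assuming a huggingface_hub release contemporary with this PR (`ModelFilter` has since been deprecated upstream). Zero-shot classification maps to the Hub's `text-generation` pipeline tag, so any transformers/pytorch text-generation model counts as compatible:

from huggingface_hub import HfApi, ModelFilter

# List Hub models whose pipeline tag matches the task and that ship
# transformers/pytorch weights, mirroring the filter in the hunk above
model_filter = ModelFilter(task="text-generation", library=["transformers", "pytorch"])
compatible_models = [model.modelId for model in HfApi().list_models(filter=model_filter)]
print(compatible_models[:5])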
@@ -195,9 +197,11 @@ def create_autotrain_project_name(dataset_id: str, dataset_config: str) -> str:
     """Creates an AutoTrain project name for the given dataset ID."""
     # Project names cannot have "/", so we need to format community datasets accordingly
     dataset_id_formatted = dataset_id.replace("/", "__")
-
-
-
+    dataset_config_formatted = dataset_config.replace("--", "__")
+    # Project names need to be unique, so we append a random string to guarantee this while adhering to naming rules
+    basename = f"eval-{dataset_id_formatted}-{dataset_config_formatted}"
+    basename = basename[:60] if len(basename) > 60 else basename  # Hub naming limitation
+    return f"{basename}-{str(uuid.uuid4())[:6]}"
 
 
 def get_config_metadata(config: str, metadata: List[Dict] = None) -> Union[Dict, None]:
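End to end, the naming scheme now reads: reformat the dataset ID and config, truncate to the Hub's 60-character limit, and append a random suffix for uniqueness. The post-diff function as a runnable snippet (the example arguments are hypothetical):

import uuid

def create_autotrain_project_name(dataset_id: str, dataset_config: str) -> str:
    # Project names cannot contain "/", so community datasets are reformatted
    dataset_id_formatted = dataset_id.replace("/", "__")
    dataset_config_formatted = dataset_config.replace("--", "__")
    basename = f"eval-{dataset_id_formatted}-{dataset_config_formatted}"
    basename = basename[:60] if len(basename) > 60 else basename  # Hub naming limitation
    return f"{basename}-{str(uuid.uuid4())[:6]}"

print(create_autotrain_project_name("facebook/anli", "plain_text"))
# e.g. eval-facebook__anli-plain_text-4c1d9a (suffix varies per call)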