Spaces:

ahmedsamirio
/

farming_config_space

Sleeping

App Files Community

burtenshaw HF Staff committed on Apr 23, 2024

Commit

798f8ba

verified ·

1 Parent(s): 4b83e74

Upload 16 files

Browse files

Files changed (4) hide show

pages/2_👩🏼‍🔬 Describe Domain.py +22 -7
pages/3_🌱 Generate Dataset.py +54 -25
pipeline.yaml +3 -3
utils.py +5 -4

pages/2_👩🏼‍🔬 Describe Domain.py CHANGED Viewed

@@ -84,13 +84,23 @@ with tab_domain_perspectives:
     perspectives = st.session_state.get(
         "perspectives",
-        [st.text_input(f"Domain Perspective 0", value=DEFAULT_PERSPECTIVES[0])],
     )
-    if st.button("Add New Perspective"):
         n = len(perspectives)
         value = DEFAULT_PERSPECTIVES[n] if n < N_PERSPECTIVES else ""
-        perspectives.append(st.text_input(f"Domain Perspective {n}", value=""))
         st.session_state["perspectives"] = perspectives
@@ -104,14 +114,19 @@ with tab_domain_topics:
         """Topics are the main themes or subjects that are relevant to the domain. For example, the domain of farming can have topics like soil health, crop rotation, or livestock management."""
     )
     topics = st.session_state.get(
-        "topics", [st.text_input(f"Domain Topic 0", value=DEFAULT_TOPICS[0])]
     )
-    new_topic = st.button("Add New Topic")
-    if new_topic:
         n = len(topics)
         value = DEFAULT_TOPICS[n] if n < N_TOPICS else ""
-        topics.append(st.text_input(f"Domain Topic {n}", value=value))
         st.session_state["topics"] = topics

     perspectives = st.session_state.get(
         "perspectives",
+        [DEFAULT_PERSPECTIVES[0]],
     )
+    perspectives_container = st.container()
+    perspectives = [
+        perspectives_container.text_input(
+            f"Domain Perspective {i + 1}", value=perspective
+        )
+        for i, perspective in enumerate(perspectives)
+    ]
+    if st.button("Add Perspective", key="add_perspective"):
         n = len(perspectives)
         value = DEFAULT_PERSPECTIVES[n] if n < N_PERSPECTIVES else ""
+        perspectives.append(
+            perspectives_container.text_input(f"Domain Perspective {n + 1}", value="")
+        )
         st.session_state["perspectives"] = perspectives
         """Topics are the main themes or subjects that are relevant to the domain. For example, the domain of farming can have topics like soil health, crop rotation, or livestock management."""
     )
     topics = st.session_state.get(
+        "topics",
+        [DEFAULT_TOPICS[0]],
     )
+    topics_container = st.container()
+    topics = [
+        topics_container.text_input(f"Domain Topic {i + 1}", value=topic)
+        for i, topic in enumerate(topics)
+    ]
+    if st.button("Add Topic", key="add_topic"):
         n = len(topics)
         value = DEFAULT_TOPICS[n] if n < N_TOPICS else ""
+        topics.append(topics_container.text_input(f"Domain Topics {n + 1}", value=""))
         st.session_state["topics"] = topics

pages/3_🌱 Generate Dataset.py CHANGED Viewed

@@ -27,38 +27,57 @@ project_sidebar()
 st.header("🧑‍🌾 Domain Data Grower")
 st.divider()
 st.subheader("Step 3. Run the pipeline to generate synthetic data")
-st.write(
-    "Define the project details, including the project name, domain, and API credentials"
-)
 ###############################################################
 # CONFIGURATION
 ###############################################################
-st.divider()
-st.markdown("### Pipeline Configuration")
-st.write("🤗 Hub details to pull the seed data")
 hub_username = st.text_input("Hub Username", HUB_USERNAME)
 project_name = st.text_input("Project Name", PROJECT_NAME)
 repo_id = f"{hub_username}/{project_name}"
 hub_token = st.text_input("Hub Token", type="password")
-st.write("🤖 Inference configuration")
 st.write(
     "Add the url of the Huggingface inference API or endpoint that your pipeline should use. You can find compatible models here:"
 )
-st.link_button(
-    "🤗 Inference compaptible models on the hub",
-    "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
-)
-base_url = st.text_input("Base URL")
-st.write("🔬 Argilla API details to push the generated dataset")
 argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
 argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
 argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name)
@@ -68,7 +87,7 @@ st.divider()
 # LOCAL
 ###############################################################
-st.markdown("### Run the pipeline")
 st.write(
     "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
@@ -101,10 +120,15 @@ if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
         ]
     ):
         with st.spinner("Pulling seed data from the Hub..."):
-            seed_data = pull_seed_data_from_repo(
-                repo_id=f"{hub_username}/{project_name}",
-                hub_token=hub_token,
-            )
             domain = seed_data["domain"]
             perspectives = seed_data["perspectives"]
@@ -177,17 +201,22 @@ if CODELESS_DISTILABEL:
             ]
         ):
             with st.spinner("Pulling seed data from the Hub..."):
-                seed_data = pull_seed_data_from_repo(
-                    repo_id=f"{hub_username}/{project_name}",
-                    hub_token=hub_token,
-                )
                 domain = seed_data["domain"]
                 perspectives = seed_data["perspectives"]
                 topics = seed_data["topics"]
                 examples = seed_data["examples"]
                 domain_expert_prompt = seed_data["domain_expert_prompt"]
-            with st.spinner("Serializing the pipeline configuration..."):
                 serialize_pipeline(
                     argilla_api_key=argilla_api_key,
                     argilla_dataset_name=argilla_dataset_name,

 st.header("🧑‍🌾 Domain Data Grower")
 st.divider()
 st.subheader("Step 3. Run the pipeline to generate synthetic data")
+st.write("Define the project repos and models that the pipeline will use.")
+st.divider()
 ###############################################################
 # CONFIGURATION
 ###############################################################
+st.markdown("## Pipeline Configuration")
+st.markdown("#### 🤗 Hub details to pull the seed data")
 hub_username = st.text_input("Hub Username", HUB_USERNAME)
 project_name = st.text_input("Project Name", PROJECT_NAME)
 repo_id = f"{hub_username}/{project_name}"
 hub_token = st.text_input("Hub Token", type="password")
+st.divider()
+st.markdown("#### 🤖 Inference configuration")
 st.write(
     "Add the url of the Huggingface inference API or endpoint that your pipeline should use. You can find compatible models here:"
 )
+with st.expander("🤗 Recommended Models"):
+    st.write("All inference endpoint compatible models can be found via the link below")
+    st.link_button(
+        "🤗 Inference compaptible models on the hub",
+        "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
+    )
+    st.write("🔋Projects with sufficient resources could take advantage of LLama3 70b")
+    st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B")
+    st.write("🪫Projects with less resources could take advantage of LLama 3 8b")
+    st.code("https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B")
+    st.write("🍃Projects with even less resources could take advantage of Phi-2")
+    st.code("https://api-inference.huggingface.co/models/microsoft/phi-2")
+    st.write("Note Hugggingface Pro gives access to more compute resources")
+    st.link_button(
+        "🤗 Huggingface Pro",
+        "https://huggingface.co/pricing",
+    )
+base_url = st.text_input(
+    label="Base URL for the Inference API",
+    value="https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta",
+)
+st.divider()
+st.markdown("#### 🔬 Argilla API details to push the generated dataset")
 argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
 argilla_api_key = st.text_input("Argilla API Key", "owner.apikey")
 argilla_dataset_name = st.text_input("Argilla Dataset Name", project_name)
 # LOCAL
 ###############################################################
+st.markdown("## Run the pipeline")
 st.write(
     "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
         ]
     ):
         with st.spinner("Pulling seed data from the Hub..."):
+            try:
+                seed_data = pull_seed_data_from_repo(
+                    repo_id=f"{hub_username}/{project_name}",
+                    hub_token=hub_token,
+                )
+            except Exception:
+                st.error(
+                    "Seed data not found. Please make sure you pushed the data seed in Step 2."
+                )
             domain = seed_data["domain"]
             perspectives = seed_data["perspectives"]
             ]
         ):
             with st.spinner("Pulling seed data from the Hub..."):
+                try:
+                    seed_data = pull_seed_data_from_repo(
+                        repo_id=f"{hub_username}/{project_name}",
+                        hub_token=hub_token,
+                    )
+                except Exception as e:
+                    st.error(
+                        "Seed data not found. Please make sure you pushed the data seed in Step 2."
+                    )
                 domain = seed_data["domain"]
                 perspectives = seed_data["perspectives"]
                 topics = seed_data["topics"]
                 examples = seed_data["examples"]
                 domain_expert_prompt = seed_data["domain_expert_prompt"]
                 serialize_pipeline(
                     argilla_api_key=argilla_api_key,
                     argilla_dataset_name=argilla_dataset_name,

pipeline.yaml CHANGED Viewed

@@ -54,7 +54,7 @@ pipeline:
         model_id: null
         endpoint_name: null
         endpoint_namespace: null
-        base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2
         tokenizer_id: null
         model_display_name: null
         use_openai_client: false
@@ -163,7 +163,7 @@ pipeline:
         model_id: null
         endpoint_name: null
         endpoint_namespace: null
-        base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2
         tokenizer_id: null
         model_display_name: null
         use_openai_client: false
@@ -390,7 +390,7 @@ pipeline:
         model_id: null
         endpoint_name: null
         endpoint_namespace: null
-        base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2
         tokenizer_id: null
         model_display_name: null
         use_openai_client: false

         model_id: null
         endpoint_name: null
         endpoint_namespace: null
+        base_url: https://api-inference.huggingface.co/models/microsoft/phi-2
         tokenizer_id: null
         model_display_name: null
         use_openai_client: false
         model_id: null
         endpoint_name: null
         endpoint_namespace: null
+        base_url: https://api-inference.huggingface.co/models/microsoft/phi-2
         tokenizer_id: null
         model_display_name: null
         use_openai_client: false
         model_id: null
         endpoint_name: null
         endpoint_namespace: null
+        base_url: https://api-inference.huggingface.co/models/microsoft/phi-2
         tokenizer_id: null
         model_display_name: null
         use_openai_client: false

utils.py CHANGED Viewed

@@ -18,15 +18,16 @@ def project_sidebar():
         )
         st.stop()
     st.sidebar.markdown(
-        """
-        ## 🌱 Domain Data Grower
         This space helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
         """
     )
-    st.sidebar.subheader(f"Project Details: {PROJECT_NAME}")
     st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL)
     st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL)
     st.sidebar.divider()
     st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)

         )
         st.stop()
+    st.sidebar.subheader(f"A Data Growing Project in the domain of {PROJECT_NAME}")
     st.sidebar.markdown(
+        """
         This space helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
         """
     )
     st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL)
     st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL)
     st.sidebar.divider()
     st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)
+    st.sidebar.link_button(
+        "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
+    )