Spaces:

argilla
/

domain-specific-datasets-welcome

Running

App Files Files Community

Ben Burtenshaw committed on Apr 26, 2024

Commit

3c9d064

1 Parent(s): 0ac0929

transfer pipeline

Browse files

Files changed (4) hide show

app.py +8 -0
hub.py +31 -0
pipeline.py +183 -0
utils.py +0 -33

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ from hub import (
     setup_dataset_on_hub,
     duplicate_space_on_hub,
     add_project_config_to_space_repo,
 )
 import streamlit as st
@@ -107,6 +108,13 @@ if st.button("🤗 Setup Project Resources"):
             argilla_space_repo_id=f"{hub_username}/{argilla_name}",
             project_space_repo_id=f"{hub_username}/{space_name}",
         )
     st.subheader("👢 Next Steps")

     setup_dataset_on_hub,
     duplicate_space_on_hub,
     add_project_config_to_space_repo,
+    push_pipeline_to_hub,
 )
 import streamlit as st
             argilla_space_repo_id=f"{hub_username}/{argilla_name}",
             project_space_repo_id=f"{hub_username}/{space_name}",
         )
+        push_pipeline_to_hub(
+            pipeline_path="pipeline.py",
+            hub_username=hub_username,
+            hub_token=hub_token,
+            project_name=project_name,
+        )
     st.subheader("👢 Next Steps")

hub.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import json
 from huggingface_hub import duplicate_space, HfApi
@@ -61,3 +63,32 @@ def add_project_config_to_space_repo(
         repo_id=project_space_repo_id,
         repo_type="space",
     )

 import json
+from tempfile import mktemp
 from huggingface_hub import duplicate_space, HfApi
         repo_id=project_space_repo_id,
         repo_type="space",
     )
+def pull_seed_data_from_repo(repo_id, hub_token, filename="seed_data.json"):
+    """Download a JSON seed-data file from a dataset repo on the Hub and parse it.
+
+    Args:
+        repo_id: Hub dataset repository id in ``"<user>/<name>"`` form.
+        hub_token: Hugging Face token used to authenticate the download.
+        filename: Path of the JSON file inside the repository. The default
+            mirrors the default seed path used by ``pipeline.py``; pass another
+            name if the repo stores its seed data elsewhere.
+
+    Returns:
+        The parsed JSON content of the downloaded file.
+    """
+    # `filename` identifies the file *inside the repo*; the download call hands
+    # back the local path of the fetched copy, so there is no need to invent a
+    # temporary file name (and `mktemp` is deprecated and race-prone anyway).
+    local_path = hf_api.hf_hub_download(
+        repo_id=repo_id, token=hub_token, repo_type="dataset", filename=filename
+    )
+    # The context manager closes the handle instead of leaking it.
+    with open(local_path, encoding="utf-8") as seed_file:
+        return json.load(seed_file)
+def push_pipeline_to_hub(
+    pipeline_path,
+    hub_username,
+    hub_token: str,
+    project_name,
+):
+    """Upload a local pipeline script to the project's dataset repo on the Hub.
+
+    The file is always stored in the repo as ``pipeline.py``, whatever its
+    local name is.
+
+    Args:
+        pipeline_path: Local path (or file object) of the script to upload.
+        hub_username: Hub user or organisation that owns the dataset repo.
+        hub_token: Hugging Face token used to authenticate the upload.
+        project_name: Name of the dataset repo under ``hub_username``.
+    """
+    repo_id = f"{hub_username}/{project_name}"
+    # Gather the upload arguments first so the call itself stays a one-liner.
+    upload_arguments = {
+        "path_or_fileobj": pipeline_path,
+        "path_in_repo": "pipeline.py",
+        "token": hub_token,
+        "repo_id": repo_id,
+        "repo_type": "dataset",
+    }
+    hf_api.upload_file(**upload_arguments)
+    print(f"pipeline.py uploaded to {repo_id}")

pipeline.py ADDED Viewed

	@@ -0,0 +1,183 @@

+import json
+from textwrap import dedent
+from typing import Any, Dict, List
+from distilabel.llms.huggingface import InferenceEndpointsLLM
+from distilabel.pipeline import Pipeline
+from distilabel.steps import TextGenerationToArgilla
+from distilabel.steps.expand import ExpandColumns
+from distilabel.steps.generators.data import LoadDataFromDicts
+from distilabel.steps.tasks.self_instruct import SelfInstruct
+from distilabel.steps.tasks.text_generation import TextGeneration
+from distilabel.steps.tasks.typing import ChatType
+################################################################################
+# Functions to create task prompts
+################################################################################
+def create_application_instruction(domain: str, examples: List[Dict[str, str]]):
+    """Create the instruction for Self-Instruct task.
+
+    Builds a prompt describing the kind of queries to generate for ``domain``
+    and appends every example as a question/answer bullet pair.
+
+    Args:
+        domain: Name of the domain, interpolated into the prompt.
+        examples: Dicts that each provide a ``"question"`` and an ``"answer"``.
+
+    Returns:
+        The assembled prompt string.
+
+    Raises:
+        KeyError: If an example lacks a ``"question"`` or ``"answer"`` key.
+    """
+    # Assembled line by line: wrapping the f-string in ``dedent`` stripped
+    # nothing because its first line had no indentation, so every following
+    # line carried its source-code indentation into the prompt.
+    system_prompt = "\n".join(
+        [
+            f"You are an AI assistant that generates queries around the domain of {domain}.",
+            "You should not expect basic but profound questions from your users.",
+            "The queries should reflect a diversity of vision and economic positions and political positions.",
+            f"The queries may know about different methods of {domain}.",
+            "The queries can be positioned politically, economically, socially, or practically.",
+            "Also take into account the impact of diverse causes on diverse domains.",
+        ]
+    )
+    # One bullet pair per example, in the order given.
+    system_prompt += "".join(
+        f"\n- Question: {example['question']}\n- Answer: {example['answer']}\n"
+        for example in examples
+    )
+    # The prompt was previously built and then dropped; hand it to the caller.
+    return system_prompt
+def create_seed_terms(topics: List[str], perspectives: List[str]) -> List[str]:
+    """Pair every topic with every perspective to seed Self-Instruct.
+
+    Terms are ordered topic-major: all perspectives for the first topic, then
+    all perspectives for the second, and so on.
+    """
+    seed_terms: List[str] = []
+    for subject in topics:
+        for viewpoint in perspectives:
+            seed_terms.append(f"{subject} from a {viewpoint} perspective")
+    return seed_terms
+################################################################################
+# Define our custom step for the domain expert
+################################################################################
+class DomainExpert(TextGeneration):
+    """Text-generation task that answers each instruction under a fixed system prompt.
+
+    Every chat it builds has two messages: ``system_prompt`` as the system
+    message, and ``template`` filled in from the input row as the user message.
+    """
+
+    # Content of the system message; supplied when the task is created.
+    system_prompt: str
+    # User-message template, formatted with the keys of the input row.
+    template: str = """This is the the instruction: {instruction}"""
+
+    def format_input(self, input: Dict[str, Any]) -> "ChatType":
+        """Return the system + user chat messages for one input row."""
+        system_message = {"role": "system", "content": self.system_prompt}
+        user_message = {"role": "user", "content": self.template.format(**input)}
+        return [system_message, user_message]
+################################################################################
+# Main script to run the pipeline
+################################################################################
+# Script entry point: parse CLI options, load the seed data file, build the
+# distilabel pipeline (load terms -> self-instruct -> expand -> domain expert
+# -> Argilla) and run it.
+if __name__ == "__main__":
+    import argparse
+    # NOTE(review): json is already imported at module level; this re-import is redundant.
+    import json
+    parser = argparse.ArgumentParser(
+        description="Run the pipeline to generate domain-specific datasets."
+    )
+    # None of the options below is marked required; omitted ones without a
+    # default reach the pipeline as None.
+    parser.add_argument("--hub-token", type=str, help="The Hugging Face API token.")
+    parser.add_argument("--argilla-api-key", type=str, help="The Argilla API key.")
+    parser.add_argument("--argilla-api-url", type=str, help="The Argilla API URL.")
+    parser.add_argument(
+        "--argilla-dataset-name", type=str, help="The name of the dataset in Argilla."
+    )
+    # NOTE(review): this flag is spelled with underscores while every other flag
+    # uses hyphens.
+    parser.add_argument(
+        "--seed_data_path",
+        type=str,
+        help="The path to the seed data.",
+        default="seed_data.json",
+    )
+    parser.add_argument(
+        "--endpoint-base-url", type=str, help="The base URL of the inference endpoint."
+    )
+    args = parser.parse_args()
+    # collect our seed data
+    with open(args.seed_data_path, "r") as f:
+        seed_data = json.load(f)
+    # Every key is optional; missing keys fall back to empty values (or
+    # "domain" for the domain name).
+    topics = seed_data.get("topics", [])
+    perspectives = seed_data.get("perspectives", [])
+    domain_expert_prompt = seed_data.get("domain_expert_prompt", "")
+    examples = seed_data.get("examples", [])
+    domain_name = seed_data.get("domain_name", "domain")
+    # Define the task prompts
+    terms = create_seed_terms(topics=topics, perspectives=perspectives)
+    # NOTE(review): application_instruction is not passed to any step below --
+    # confirm whether the self-instruct step was meant to receive it.
+    application_instruction = create_application_instruction(
+        domain=domain_name, examples=examples
+    )
+    # Define the distilabel pipeline
+    with Pipeline(domain_name) as pipeline:
+        # One input row per seed term.
+        load_data = LoadDataFromDicts(
+            name="load_data",
+            data=[{"input": term} for term in terms],
+            batch_size=64,
+        )
+        self_instruct = SelfInstruct(
+            name="self_instruct",
+            num_instructions=5,
+            input_batch_size=8,
+            llm=InferenceEndpointsLLM(
+                base_url=args.endpoint_base_url,
+                api_key=args.hub_token,
+            ),
+        )
+        # Maps the "instructions" column to "instruction"; presumably yields one
+        # row per generated instruction -- confirm against the distilabel docs.
+        expand_instructions = ExpandColumns(
+            name="expand_columns", columns={"instructions": "instruction"}
+        )
+        domain_expert = DomainExpert(
+            name="domain_expert",
+            llm=InferenceEndpointsLLM(
+                base_url=args.endpoint_base_url,
+                api_key=args.hub_token,
+            ),
+            input_batch_size=8,
+            system_prompt=domain_expert_prompt,
+        )
+        # The Argilla workspace is hard-coded to "admin".
+        to_argilla = TextGenerationToArgilla(
+            name="text_generation_to_argilla",
+            dataset_name=args.argilla_dataset_name,
+            dataset_workspace="admin",
+            api_url=args.argilla_api_url,
+            api_key=args.argilla_api_key,
+        )
+        # Connect up the pipeline
+        load_data.connect(self_instruct)
+        self_instruct.connect(expand_instructions)
+        expand_instructions.connect(domain_expert)
+        domain_expert.connect(to_argilla)
+    # Run the pipeline
+    # The LLM and Argilla settings given at construction are passed again here
+    # as runtime parameters, keyed by step name.
+    pipeline.run(
+        parameters={
+            "self_instruct": {
+                "llm": {"api_key": args.hub_token, "base_url": args.endpoint_base_url}
+            },
+            "domain_expert": {
+                "llm": {"api_key": args.hub_token, "base_url": args.endpoint_base_url}
+            },
+            "text_generation_to_argilla": {
+                "dataset_name": args.argilla_dataset_name,
+                "api_key": args.argilla_api_key,
+                "api_url": args.argilla_api_url,
+            },
+        },
+        use_cache=False,
+    )

utils.py DELETED Viewed

@@ -1,33 +0,0 @@
-import streamlit as st
-from defaults import (
-    ARGILLA_SPACE_REPO_ID,
-    PROJECT_NAME,
-    ARGILLA_URL,
-    DIBT_PARENT_APP_URL,
-    DATASET_URL,
-    DATASET_REPO_ID,
-    ARGILLA_SPACE_REPO_ID,
-)
-def project_sidebar():
-    if PROJECT_NAME == "DEFAULT_DOMAIN":
-        st.warning(
-            "Please set up the project configuration in the parent app before proceeding."
-        )
-        st.stop()
-    st.sidebar.subheader(f"A Data Growing Project in the domain of {PROJECT_NAME}")
-    st.sidebar.markdown(
-        """
-        This space helps you create a dataset seed for building diverse domain-specific datasets for aligning models.
-        """
-    )
-    st.sidebar.link_button(f"📚 Dataset Repo", DATASET_URL)
-    st.sidebar.link_button(f"🤖 Argilla Space", ARGILLA_URL)
-    st.sidebar.divider()
-    st.sidebar.link_button("🧑‍🌾 New Project", DIBT_PARENT_APP_URL)
-    st.sidebar.link_button(
-        "🤗 Get your Hub Token", "https://huggingface.co/settings/tokens"
-    )