Upload 4 files
Browse files- app.py +101 -0
- defaults.py +7 -0
- hub.py +50 -0
- pages/🧑🌾 Domain Data Grower.py +15 -0
app.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
|
| 3 |
+
from regex import F
|
| 4 |
+
from defaults import (
|
| 5 |
+
DEFAULT_DOMAIN,
|
| 6 |
+
)
|
| 7 |
+
from hub import (
|
| 8 |
+
setup_dataset_on_hub,
|
| 9 |
+
duplicate_space_on_hub,
|
| 10 |
+
add_project_config_to_space_repo,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
import streamlit as st
|
| 14 |
+
|
| 15 |
+
# Intro copy rendered under the "Create a domain specific dataset" header.
INTRO_MARKDOWN = """This space will set up your domain specific dataset project. It will
create the resources that you need to build a dataset. Those resources include:

- A dataset repository on the Hub
- Another space to define expert domain and run generation pipelines

For a complete overview of the project. Check out the README
"""

st.set_page_config(page_title="Domain Data Grower", page_icon="🧑🌾")
st.header("🧑🌾 Domain Data Grower")
st.divider()

# ---------------------------------------------------------------------------
# App markdown: what this space does, plus a link to the README page
# ---------------------------------------------------------------------------

st.header("🌱 Create a domain specific dataset")
st.markdown(INTRO_MARKDOWN)
st.page_link("pages/🧑🌾 Domain Data Grower.py", label="Domain Data Grower", icon="🧑🌾")

# ---------------------------------------------------------------------------
# Configuration: values read by the "Setup Project Resources" button handler
# ---------------------------------------------------------------------------

st.subheader("🌾 Project Configuration")

project_name = st.text_input(label="Project Name", value=DEFAULT_DOMAIN)
hub_username = st.text_input(label="Hub Username", value="argilla")
hub_token = st.text_input(label="Hub Token", type="password")
private_selector = st.checkbox(label="Private Space", value=False)
|
| 52 |
+
|
| 53 |
+
if st.button("🤗 Setup Project Resources"):
    # Names of everything this run creates on the Hub.
    repo_id = f"{hub_username}/{project_name}"
    space_name = f"{project_name}_config_space"
    argilla_name = f"{project_name}_argilla_space"

    # Fully qualified Space ids. These are passed to the duplication calls so
    # the Spaces are created under `hub_username` -- the same namespace the
    # success links and the config upload below point at -- rather than under
    # whichever account owns the token.
    project_space_repo_id = f"{hub_username}/{space_name}"
    argilla_space_repo_id = f"{hub_username}/{argilla_name}"

    # 1) Dataset repository.
    setup_dataset_on_hub(
        repo_id=repo_id,
        hub_token=hub_token,
    )

    st.success(
        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name}). Hold on the repo_id: {repo_id}, we will need it in the next steps."
    )

    # 2) Configuration Space, duplicated from the template Space.
    duplicate_space_on_hub(
        source_repo="argilla/domain-specific-datasets-template",
        target_repo=project_space_repo_id,
        hub_token=hub_token,
        private=private_selector,
    )

    st.success(
        f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{space_name})."
    )

    # 3) Argilla Space, duplicated from the Argilla template Space.
    duplicate_space_on_hub(
        source_repo="argilla/argilla-template-space",
        target_repo=argilla_space_repo_id,
        hub_token=hub_token,
        private=private_selector,
    )

    st.success(
        f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{argilla_name})."
    )

    # 4) Upload the project configuration to the configuration Space.
    # NOTE(review): the fixed pause presumably gives the freshly duplicated
    # Spaces time to become available before uploading -- confirm.
    seconds = 5

    with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"):
        time.sleep(seconds)
        add_project_config_to_space_repo(
            dataset_repo_id=repo_id,
            hub_token=hub_token,
            project_name=project_name,
            argilla_space_repo_id=argilla_space_repo_id,
            project_space_repo_id=project_space_repo_id,
        )
|
defaults.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
# Location of the seed data file; the path is relative to the working directory.
SEED_DATA_PATH = "seed_data.json"

# Read the seed data once at import time. The encoding is pinned to UTF-8 so
# the JSON is decoded the same way regardless of the platform's locale default.
with open(SEED_DATA_PATH, encoding="utf-8") as f:
    DEFAULT_DATA = json.load(f)

# Value stored under the "domain" key of the seed data.
DEFAULT_DOMAIN = DEFAULT_DATA["domain"]
|
hub.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
from huggingface_hub import duplicate_space, HfApi
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Module-level Hub client used by the helpers in this file. It is created
# without credentials; each helper passes the caller's token per request.
hf_api = HfApi()
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def setup_dataset_on_hub(repo_id, hub_token):
    """Create the dataset repository ``repo_id`` on the Hub.

    Args:
        repo_id: Id of the dataset repository to create.
        hub_token: Hub token used to authenticate the request.
    """
    hf_api.create_repo(repo_id=repo_id, repo_type="dataset", token=hub_token)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def duplicate_space_on_hub(source_repo, target_repo, hub_token, private=False):
    """Duplicate the Space ``source_repo`` into ``target_repo``.

    Args:
        source_repo: Id of the Space to copy from.
        target_repo: Id of the Space to create.
        hub_token: Hub token used to authenticate the request.
        private: Forwarded as the ``private`` flag of the new Space.
    """
    duplicate_space(
        from_id=source_repo,
        to_id=target_repo,
        private=private,
        token=hub_token,
    )
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def add_project_config_to_space_repo(
    dataset_repo_id,
    hub_token,
    project_name,
    argilla_space_repo_id,
    project_space_repo_id,
):
    """Upload ``project_config.json`` to the project Space repository.

    The config records the project name and the ids of the dataset repository,
    the Argilla Space and the project Space, serialised as JSON.

    Args:
        dataset_repo_id: Id of the dataset repository, stored in the config.
        hub_token: Hub token used to authenticate the upload.
        project_name: Project name, stored in the config.
        argilla_space_repo_id: Id of the Argilla Space, stored in the config.
        project_space_repo_id: Id of the project Space; stored in the config
            and used as the upload target.
    """
    project_config = {
        "project_name": project_name,
        "argilla_space_repo_id": argilla_space_repo_id,
        "project_space_repo_id": project_space_repo_id,
        "dataset_repo_id": dataset_repo_id,
    }

    # Upload the serialised config straight from memory rather than via a file
    # in the working directory: no stray local file is left behind, and
    # concurrent callers cannot overwrite each other's config before upload.
    payload = json.dumps(project_config).encode("utf-8")

    hf_api.upload_file(
        path_or_fileobj=payload,
        path_in_repo="project_config.json",
        token=hub_token,
        repo_id=project_space_repo_id,
        repo_type="space",
    )
|
pages/🧑🌾 Domain Data Grower.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import requests
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
# Raw GitHub URL of the project README rendered on this page; the URL embeds a
# specific commit hash.
readme_location = "https://raw.githubusercontent.com/huggingface/data-is-better-together/4d7848149dcfe575b86517ca15e4aaa09dc9db74/domain-specific-datasets/README.md"
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def open_markdown_file(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the markdown document to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the page indefinitely.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never returned as if it were the document.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Fetch the README and render it as this page's content.
st.markdown(open_markdown_file(readme_location))
|