Spaces:
Running
Running
theo
committed on
Commit
·
c4882f0
1
Parent(s):
ef36700
rely on tagsets from datasets
Browse files
- tagging_app.py +18 -45
tagging_app.py
CHANGED
|
@@ -5,7 +5,9 @@ from typing import Callable, Dict, List, Tuple
|
|
| 5 |
import langcodes as lc
|
| 6 |
import streamlit as st
|
| 7 |
import yaml
|
| 8 |
-
from datasets.utils.metadata import DatasetMetadata
|
|
|
|
|
|
|
| 9 |
|
| 10 |
st.set_page_config(
|
| 11 |
page_title="HF Dataset Tagging App",
|
|
@@ -26,34 +28,6 @@ st.markdown(
|
|
| 26 |
unsafe_allow_html=True,
|
| 27 |
)
|
| 28 |
|
| 29 |
-
task_set = json.load(open("task_set.json"))
|
| 30 |
-
license_set = json.load(open("license_set.json"))
|
| 31 |
-
|
| 32 |
-
multilinguality_set = {
|
| 33 |
-
"monolingual": "contains a single language",
|
| 34 |
-
"multilingual": "contains multiple languages",
|
| 35 |
-
"translation": "contains translated or aligned text",
|
| 36 |
-
"other": "other type of language distribution",
|
| 37 |
-
}
|
| 38 |
-
|
| 39 |
-
creator_set = {
|
| 40 |
-
"language": [
|
| 41 |
-
"found",
|
| 42 |
-
"crowdsourced",
|
| 43 |
-
"expert-generated",
|
| 44 |
-
"machine-generated",
|
| 45 |
-
"other",
|
| 46 |
-
],
|
| 47 |
-
"annotations": [
|
| 48 |
-
"found",
|
| 49 |
-
"crowdsourced",
|
| 50 |
-
"expert-generated",
|
| 51 |
-
"machine-generated",
|
| 52 |
-
"no-annotation",
|
| 53 |
-
"other",
|
| 54 |
-
],
|
| 55 |
-
}
|
| 56 |
-
|
| 57 |
########################
|
| 58 |
## Helper functions
|
| 59 |
########################
|
|
@@ -117,7 +91,7 @@ def new_state() -> Dict[str, List]:
|
|
| 117 |
|
| 118 |
|
| 119 |
def is_state_empty(state: Dict[str, List]) -> bool:
|
| 120 |
-
return sum(len(v) if v is not None else 0 for v in state.values())
|
| 121 |
|
| 122 |
|
| 123 |
state = new_state()
|
|
@@ -160,7 +134,7 @@ if leftbtn.button("pre-load"):
|
|
| 160 |
initial_state = existing_tag_sets[preloaded_id]
|
| 161 |
state = initial_state or new_state()
|
| 162 |
st.experimental_set_query_params(preload_dataset=preloaded_id)
|
| 163 |
-
if is_state_empty(state):
|
| 164 |
if rightbtn.button("flush state"):
|
| 165 |
state = new_state()
|
| 166 |
initial_state = None
|
|
@@ -195,8 +169,8 @@ state["task_categories"] = multiselect(
|
|
| 195 |
"Task category",
|
| 196 |
"What categories of task does the dataset support?",
|
| 197 |
values=state["task_categories"],
|
| 198 |
-
valid_set=list(task_set.keys()),
|
| 199 |
-
format_func=lambda tg: f"{tg}: {task_set[tg]['description']}",
|
| 200 |
)
|
| 201 |
task_specifics = []
|
| 202 |
for tg in state["task_categories"]:
|
|
@@ -204,8 +178,8 @@ for tg in state["task_categories"]:
|
|
| 204 |
leftcol,
|
| 205 |
f"Specific _{tg}_ tasks",
|
| 206 |
f"What specific tasks does the dataset support?",
|
| 207 |
-
values=[ts for ts in (state["task_ids"] or []) if ts in task_set[tg]["options"]],
|
| 208 |
-
valid_set=task_set[tg]["options"],
|
| 209 |
)
|
| 210 |
if "other" in specs:
|
| 211 |
other_task = st.text_input(
|
|
@@ -224,8 +198,8 @@ state["multilinguality"] = multiselect(
|
|
| 224 |
"Monolingual?",
|
| 225 |
"Does the dataset contain more than one language?",
|
| 226 |
values=state["multilinguality"],
|
| 227 |
-
valid_set=list(multilinguality_set.keys()),
|
| 228 |
-
format_func=lambda m: f"{m} : {multilinguality_set[m]}",
|
| 229 |
)
|
| 230 |
|
| 231 |
if "other" in state["multilinguality"]:
|
|
@@ -260,14 +234,14 @@ state["language_creators"] = multiselect(
|
|
| 260 |
"Data origin",
|
| 261 |
"Where does the text in the dataset come from?",
|
| 262 |
values=state["language_creators"],
|
| 263 |
-
valid_set=creator_set["language"],
|
| 264 |
)
|
| 265 |
state["annotations_creators"] = multiselect(
|
| 266 |
leftcol,
|
| 267 |
"Annotations origin",
|
| 268 |
"Where do the annotations in the dataset come from?",
|
| 269 |
values=state["annotations_creators"],
|
| 270 |
-
valid_set=creator_set["annotations"],
|
| 271 |
)
|
| 272 |
|
| 273 |
|
|
@@ -275,9 +249,9 @@ state["licenses"] = multiselect(
|
|
| 275 |
leftcol,
|
| 276 |
"Licenses",
|
| 277 |
"What licenses is the dataset under?",
|
| 278 |
-
valid_set=list(license_set.keys()),
|
| 279 |
values=state["licenses"],
|
| 280 |
-
format_func=lambda l: f"{l} : {license_set[l]}",
|
| 281 |
)
|
| 282 |
if "other" in state["licenses"]:
|
| 283 |
other_license = st.text_input(
|
|
@@ -320,16 +294,15 @@ if "extended" in state["extended"]:
|
|
| 320 |
extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
|
| 321 |
state["source_datasets"] += [f"extended|{src}" for src in extended_sources]
|
| 322 |
|
| 323 |
-
size_cats = ["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M", ...]
|
| 324 |
current_size_cats = state.get("size_categories") or ["unknown"]
|
| 325 |
-
ok, nonok = split_known(current_size_cats, size_cats)
|
| 326 |
if len(nonok) > 0:
|
| 327 |
leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
|
| 328 |
state["size_categories"] = [
|
| 329 |
leftcol.selectbox(
|
| 330 |
"What is the size category of the dataset?",
|
| 331 |
-
options=size_cats,
|
| 332 |
-
index=size_cats.index(ok[0]) if len(ok) > 0 else 0,
|
| 333 |
)
|
| 334 |
]
|
| 335 |
|
|
|
|
| 5 |
import langcodes as lc
|
| 6 |
import streamlit as st
|
| 7 |
import yaml
|
| 8 |
+
from datasets.utils.metadata import (DatasetMetadata, known_creators,
|
| 9 |
+
known_licenses, known_multilingualities,
|
| 10 |
+
known_size_categories, known_task_ids)
|
| 11 |
|
| 12 |
st.set_page_config(
|
| 13 |
page_title="HF Dataset Tagging App",
|
|
|
|
| 28 |
unsafe_allow_html=True,
|
| 29 |
)
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
########################
|
| 32 |
## Helper functions
|
| 33 |
########################
|
|
|
|
| 91 |
|
| 92 |
|
| 93 |
def is_state_empty(state: Dict[str, List]) -> bool:
|
| 94 |
+
return sum(len(v) if v is not None else 0 for v in state.values()) == 0
|
| 95 |
|
| 96 |
|
| 97 |
state = new_state()
|
|
|
|
| 134 |
initial_state = existing_tag_sets[preloaded_id]
|
| 135 |
state = initial_state or new_state()
|
| 136 |
st.experimental_set_query_params(preload_dataset=preloaded_id)
|
| 137 |
+
if not is_state_empty(state):
|
| 138 |
if rightbtn.button("flush state"):
|
| 139 |
state = new_state()
|
| 140 |
initial_state = None
|
|
|
|
| 169 |
"Task category",
|
| 170 |
"What categories of task does the dataset support?",
|
| 171 |
values=state["task_categories"],
|
| 172 |
+
valid_set=list(known_task_ids.keys()),
|
| 173 |
+
format_func=lambda tg: f"{tg}: {known_task_ids[tg]['description']}",
|
| 174 |
)
|
| 175 |
task_specifics = []
|
| 176 |
for tg in state["task_categories"]:
|
|
|
|
| 178 |
leftcol,
|
| 179 |
f"Specific _{tg}_ tasks",
|
| 180 |
f"What specific tasks does the dataset support?",
|
| 181 |
+
values=[ts for ts in (state["task_ids"] or []) if ts in known_task_ids[tg]["options"]],
|
| 182 |
+
valid_set=known_task_ids[tg]["options"],
|
| 183 |
)
|
| 184 |
if "other" in specs:
|
| 185 |
other_task = st.text_input(
|
|
|
|
| 198 |
"Monolingual?",
|
| 199 |
"Does the dataset contain more than one language?",
|
| 200 |
values=state["multilinguality"],
|
| 201 |
+
valid_set=list(known_multilingualities.keys()),
|
| 202 |
+
format_func=lambda m: f"{m} : {known_multilingualities[m]}",
|
| 203 |
)
|
| 204 |
|
| 205 |
if "other" in state["multilinguality"]:
|
|
|
|
| 234 |
"Data origin",
|
| 235 |
"Where does the text in the dataset come from?",
|
| 236 |
values=state["language_creators"],
|
| 237 |
+
valid_set=known_creators["language"],
|
| 238 |
)
|
| 239 |
state["annotations_creators"] = multiselect(
|
| 240 |
leftcol,
|
| 241 |
"Annotations origin",
|
| 242 |
"Where do the annotations in the dataset come from?",
|
| 243 |
values=state["annotations_creators"],
|
| 244 |
+
valid_set=known_creators["annotations"],
|
| 245 |
)
|
| 246 |
|
| 247 |
|
|
|
|
| 249 |
leftcol,
|
| 250 |
"Licenses",
|
| 251 |
"What licenses is the dataset under?",
|
| 252 |
+
valid_set=list(known_licenses.keys()),
|
| 253 |
values=state["licenses"],
|
| 254 |
+
format_func=lambda l: f"{l} : {known_licenses[l]}",
|
| 255 |
)
|
| 256 |
if "other" in state["licenses"]:
|
| 257 |
other_license = st.text_input(
|
|
|
|
| 294 |
extended_sources[extended_sources.index("other")] = f"other-{other_extended_sources}"
|
| 295 |
state["source_datasets"] += [f"extended|{src}" for src in extended_sources]
|
| 296 |
|
|
|
|
| 297 |
current_size_cats = state.get("size_categories") or ["unknown"]
|
| 298 |
+
ok, nonok = split_known(current_size_cats, known_size_categories)
|
| 299 |
if len(nonok) > 0:
|
| 300 |
leftcol.markdown(f"**Found bad codes in existing tagset**:\n{nonok}")
|
| 301 |
state["size_categories"] = [
|
| 302 |
leftcol.selectbox(
|
| 303 |
"What is the size category of the dataset?",
|
| 304 |
+
options=known_size_categories,
|
| 305 |
+
index=known_size_categories.index(ok[0]) if len(ok) > 0 else 0,
|
| 306 |
)
|
| 307 |
]
|
| 308 |
|