Tristan Thrush committed · Commit 23ca923 · Parent(s): 30f749f

removed requirement to be from autoeval org
app.py
CHANGED
@@ -45,30 +45,24 @@ def parse_metric_value(value):
     return value
 
 
-def parse_metrics_rows(meta, from_autoeval=False):
+def parse_metrics_rows(meta, only_verified=False):
     if not isinstance(meta["model-index"], list) or len(meta["model-index"]) == 0 or "results" not in meta["model-index"][0]:
         return None
     for result in meta["model-index"][0]["results"]:
         if not isinstance(result, dict) or "dataset" not in result or "metrics" not in result or "type" not in result["dataset"]:
             continue
         dataset = result["dataset"]["type"]
-        row = {"dataset": dataset, "split": "-unspecified-", "config": "-unspecified-"…
+        row = {"dataset": dataset, "split": "-unspecified-", "config": "-unspecified-"}
         if "split" in result["dataset"]:
             row["split"] = result["dataset"]["split"]
         if "config" in result["dataset"]:
             row["config"] = result["dataset"]["config"]
         no_results = True
         for metric in result["metrics"]:
 
-            if from_autoeval:
-                name = metric["name"].lower().strip()
-            else:
-                name = metric["type"].lower().strip()
-
-            if name in ("model_id", "dataset", "split", "config", "verified"):
-                # Metrics are not allowed to be named "dataset", "split", "config", or "verified".
+            name = metric["type"].lower().strip()
+
+            if name in ("model_id", "dataset", "split", "config"):
+                # Metrics are not allowed to be named "dataset", "split", "config".
                 continue
             value = parse_metric_value(metric.get("value", None))
             if value is None:
@@ -78,10 +72,7 @@ def parse_metrics_rows(meta, from_autoeval=False):
             if name not in row or new_metric_better:
                 # overwrite the metric if the new value is better.
 
-                if from_autoeval:
-                    # if the metric is from autoeval, only include it in the leaderboard if
-                    # it is a verified metric. Unverified metrics are already included
-                    # in the leaderboard from the unverified model card.
+                if only_verified:
                     if "verified" in metric and metric["verified"]:
                         no_results = False
                 row[name] = value
@@ -97,52 +88,65 @@ def get_data_wrapper():
 
     def get_data():
         data = []
-        …
+        verified_data = []
+        model_ids = get_model_ids()[:100]
         model_ids_from_autoeval = set(get_model_ids(author="autoevaluate"))
         for model_id in tqdm(model_ids):
             meta = get_metadata(model_id)
             if meta is None:
                 continue
-            for row in parse_metrics_rows(meta…
+            for row in parse_metrics_rows(meta):
                 if row is None:
                     continue
                 row["model_id"] = model_id
                 data.append(row)
+            for row in parse_metrics_rows(meta, only_verified=True):
+                if row is None:
+                    continue
+                row["model_id"] = model_id
+                verified_data.append(row)
         dataframe = pd.DataFrame.from_records(data)
         dataframe.to_pickle("cache.pkl")
+        verified_dataframe = pd.DataFrame.from_records(verified_data)
+        verified_dataframe.to_pickle("verified_cache.pkl")
 
-    if exists("cache.pkl"):
+    if exists("cache.pkl") and exists("verified_cache.pkl"):
         # If we have saved the results previously, call an asynchronous process
         # to fetch the results and update the saved file. Don't make users wait
         # while we fetch the new results. Instead, display the old results for
        # now. The new results should be loaded when this method
         # is called again.
         dataframe = pd.read_pickle("cache.pkl")
+        verified_dataframe = pd.read_pickle("verified_cache.pkl")
         t = threading.Thread(name='get_data procs', target=get_data)
         t.start()
     else:
         # We have to make the users wait during the first startup of this app.
         get_data()
         dataframe = pd.read_pickle("cache.pkl")
+        verified_dataframe = pd.read_pickle("verified_cache.pkl")
 
-    return dataframe
-
-dataframe = get_data_wrapper()
+    return dataframe, verified_dataframe
 
+dataframe, verified_dataframe = get_data_wrapper()
 
 st.markdown("# 🤗 Leaderboards")
 
+only_verified_results = st.sidebar.checkbox(
+    "Filter for Verified Results",
+)
+
+if only_verified_results:
+    dataframe = verified_dataframe
+
+selectable_datasets = list(set(dataframe.dataset.tolist()))
+
 query_params = st.experimental_get_query_params()
 default_dataset = "common_voice"
 if "dataset" in query_params:
     if len(query_params["dataset"]) > 0 and query_params["dataset"][0] in selectable_datasets:
         default_dataset = query_params["dataset"][0]
 
-only_verified_results = st.sidebar.checkbox(
-    "Filter for Verified Results",
-)
-
 dataset = st.sidebar.selectbox(
     "Dataset",
     selectable_datasets,
@@ -154,9 +158,6 @@ st.experimental_set_query_params(**{"dataset": [dataset]})
 dataset_df = dataframe[dataframe.dataset == dataset]
 dataset_df = dataset_df.dropna(axis="columns", how="all")
 
-if only_verified_results:
-    dataset_df = dataset_df[dataset_df["verified"]]
-
 selectable_configs = list(set(dataset_df["config"]))
 config = st.sidebar.selectbox(
     "Config",
@@ -171,7 +172,7 @@ split = st.sidebar.selectbox(
 )
 dataset_df = dataset_df[dataset_df.split == split]
 
-selectable_metrics = list(filter(lambda column: column not in ("model_id", "dataset", "split", "config"…
+selectable_metrics = list(filter(lambda column: column not in ("model_id", "dataset", "split", "config"), dataset_df.columns))
 
 dataset_df = dataset_df.filter(["model_id"] + selectable_metrics)
 dataset_df = dataset_df.dropna(thresh=2) # Want at least two non-na values (one for model_id and one for a metric).
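
For readers unfamiliar with the model-card metadata this diff parses: the sketch below uses a made-up example_meta dict and a simplified extract_rows() stand-in (not the Space's actual parse_metrics_rows) to show how a model-index entry maps to one leaderboard row per result, with one column per metric, and how an only_verified flag narrows that to verified metrics only. Only keys the diff itself reads (model-index, results, dataset type/config/split, metric type/value/verified) are used.

# Hypothetical model-card metadata; the model and metric values are made up.
example_meta = {
    "model-index": [
        {
            "results": [
                {
                    "dataset": {"type": "common_voice", "config": "en", "split": "test"},
                    "metrics": [
                        {"type": "wer", "value": 0.23, "verified": True},
                        {"type": "cer", "value": 0.08},
                    ],
                }
            ]
        }
    ]
}

def extract_rows(meta, only_verified=False):
    # Simplified stand-in for parse_metrics_rows(): one row per result,
    # one column per metric, optionally keeping only verified metrics.
    rows = []
    for result in meta["model-index"][0]["results"]:
        row = {
            "dataset": result["dataset"]["type"],
            "split": result["dataset"].get("split", "-unspecified-"),
            "config": result["dataset"].get("config", "-unspecified-"),
        }
        for metric in result["metrics"]:
            if only_verified and not metric.get("verified", False):
                continue
            row[metric["type"].lower().strip()] = metric.get("value")
        rows.append(row)
    return rows

print(extract_rows(example_meta))                      # both metrics
print(extract_rows(example_meta, only_verified=True))  # verified metric only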
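The comments in get_data_wrapper() describe a serve-stale-then-refresh caching pattern. Below is a minimal, self-contained sketch of that pattern, assuming a hypothetical fetch_rows() in place of the real Hub-scraping code: the first call blocks while the cache is built, and later calls return the existing pickle immediately while a background thread rebuilds it for the next request.

import threading
from os.path import exists

import pandas as pd


def fetch_rows():
    # Hypothetical placeholder for the expensive metadata-scraping step.
    return [{"model_id": "some-org/some-model", "dataset": "common_voice", "wer": 0.23}]


def get_data():
    # Rebuild the cached dataframe from scratch and persist it.
    dataframe = pd.DataFrame.from_records(fetch_rows())
    dataframe.to_pickle("cache.pkl")


def get_data_wrapper():
    if exists("cache.pkl"):
        # Serve the stale cache now; refresh it in the background for next time.
        dataframe = pd.read_pickle("cache.pkl")
        threading.Thread(name="get_data procs", target=get_data).start()
    else:
        # First startup: no cache yet, so the caller has to wait.
        get_data()
        dataframe = pd.read_pickle("cache.pkl")
    return dataframe


if __name__ == "__main__":
    print(get_data_wrapper())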