Commit
·
6c21ae3
1
Parent(s):
33c8677
Add languages
Browse files
- changelog.md +7 -0
- language.py +0 -0
- models.py +126 -105
- utils.py +69 -0
changelog.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
| 1 |
Changelog
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
v0.1
|
| 4 |
- Allow pick comparison version
|
| 5 |
- Show delta in all metrics
|
|
|
|
| 1 |
Changelog
|
| 2 |
|
| 3 |
+
v0.2 - Oct 24
|
| 4 |
+
- Languages
|
| 5 |
+
- Allow filtering for modality
|
| 6 |
+
- Show new languages for the diff
|
| 7 |
+
- Show rate of change in languages
|
| 8 |
+
- Also treat repos carrying the multilingual tag as multilingual when selecting models by language
|
| 9 |
+
|
| 10 |
v0.1
|
| 11 |
- Allow pick comparison version
|
| 12 |
- Show delta in all metrics
|
language.py
ADDED
|
File without changes
|
models.py
CHANGED
|
@@ -1,89 +1,62 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
-
from datasets import load_dataset
|
| 4 |
from ast import literal_eval
|
| 5 |
import altair as alt
|
| 6 |
-
import plotly.graph_objs as go
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
|
| 9 |
-
|
| 10 |
-
print("Build")
|
| 11 |
-
nlp_tasks = ["text-classification", "text-generation", "text2text-generation", "token-classification", "fill-mask", "question-answering",
|
| 12 |
-
"translation", "conversational", "sentence-similarity", "summarization", "multiple-choice", "zero-shot-classification", "table-question-answering"
|
| 13 |
-
]
|
| 14 |
-
audio_tasks = ["automatic-speech-recognition", "audio-classification", "text-to-speech", "audio-to-audio", "voice-activity-detection"]
|
| 15 |
-
cv_tasks = ["image-classification", "image-segmentation", "zero-shot-image-classification", "image-to-image", "unconditional-image-generation", "object-detection"]
|
| 16 |
-
multimodal = ["feature-extraction", "text-to-image", "visual-question-answering", "image-to-text", "document-question-answering"]
|
| 17 |
-
tabular = ["tabular-classification", "tabular-regression"]
|
| 18 |
-
|
| 19 |
-
modalities = {
|
| 20 |
-
"nlp": nlp_tasks,
|
| 21 |
-
"audio": audio_tasks,
|
| 22 |
-
"cv": cv_tasks,
|
| 23 |
-
"multimodal": multimodal,
|
| 24 |
-
"tabular": tabular,
|
| 25 |
-
"rl": ["reinforcement-learning"]
|
| 26 |
-
}
|
| 27 |
-
|
| 28 |
-
def modality(row):
|
| 29 |
-
pipeline = row["pipeline"]
|
| 30 |
-
for modality, tasks in modalities.items():
|
| 31 |
-
if pipeline in tasks:
|
| 32 |
-
return modality
|
| 33 |
-
if type(pipeline) == "str":
|
| 34 |
-
return "unk_modality"
|
| 35 |
-
return None
|
| 36 |
|
|
|
|
|
|
|
| 37 |
supported_revisions = ["24_10_22", "17_10_22", "10_10_22", "27_09_22"]
|
| 38 |
-
|
| 39 |
-
st.cache(allow_output_mutation=True)
|
| 40 |
-
def process_dataset(version):
|
| 41 |
-
# Load dataset at specified revision
|
| 42 |
-
dataset = load_dataset("open-source-metrics/model-repos-stats", revision=version)
|
| 43 |
-
|
| 44 |
-
# Convert to pandas dataframe
|
| 45 |
-
data = dataset["train"].to_pandas()
|
| 46 |
-
|
| 47 |
-
# Add modality column
|
| 48 |
-
data["modality"] = data.apply(modality, axis=1)
|
| 49 |
-
|
| 50 |
-
# Bin the model card length into some bins
|
| 51 |
-
data["length_bins"] = pd.cut(data["text_length"], [0, 200, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000])
|
| 52 |
-
|
| 53 |
-
return data
|
| 54 |
-
|
| 55 |
-
col1, col2 = st.columns(2)
|
| 56 |
with col1:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
base = st.selectbox(
|
| 58 |
'Old revision',
|
| 59 |
supported_revisions,
|
| 60 |
index=1)
|
| 61 |
-
with
|
| 62 |
-
|
| 63 |
-
'
|
| 64 |
supported_revisions,
|
| 65 |
-
index=
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
old_data = process_dataset(base)
|
| 68 |
data = process_dataset(new)
|
| 69 |
-
|
| 70 |
-
def eval_tags(row):
|
| 71 |
-
tags = row["tags"]
|
| 72 |
-
if tags == "none" or tags == [] or tags == "{}":
|
| 73 |
-
return []
|
| 74 |
-
if tags[0] != "[":
|
| 75 |
-
tags = str([tags])
|
| 76 |
-
val = literal_eval(tags)
|
| 77 |
-
if isinstance(val, dict):
|
| 78 |
-
return []
|
| 79 |
-
return val
|
| 80 |
-
|
| 81 |
old_data["tags"] = old_data.apply(eval_tags, axis=1)
|
| 82 |
data["tags"] = data.apply(eval_tags, axis=1)
|
| 83 |
|
|
|
|
|
|
|
| 84 |
total_samples_old = old_data.shape[0]
|
| 85 |
total_samples = data.shape[0]
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
# Tabs don't work in Spaces st version
|
| 89 |
#tab1, tab2, tab3, tab4, tab5, tab6, tab7, tab8 = st.tabs(["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super users", "Raw Data"])
|
|
@@ -92,20 +65,9 @@ def main():
|
|
| 92 |
'Topic of interest',
|
| 93 |
["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super Users", "Raw Data"])
|
| 94 |
|
| 95 |
-
# with tab1:
|
| 96 |
if tab == "Language":
|
| 97 |
st.header("Languages info")
|
| 98 |
|
| 99 |
-
data.loc[data.languages == "False", 'languages'] = None
|
| 100 |
-
data.loc[data.languages == {}, 'languages'] = None
|
| 101 |
-
old_data.loc[old_data.languages == "False", 'languages'] = None
|
| 102 |
-
old_data.loc[old_data.languages == {}, 'languages'] = None
|
| 103 |
-
|
| 104 |
-
no_lang_count = data["languages"].isna().sum()
|
| 105 |
-
no_lang_count_old = old_data["languages"].isna().sum()
|
| 106 |
-
data["languages"] = data["languages"].fillna('none')
|
| 107 |
-
old_data["languages"] = old_data["languages"].fillna('none')
|
| 108 |
-
|
| 109 |
def make_list(row):
|
| 110 |
languages = row["languages"]
|
| 111 |
if languages == "none":
|
|
@@ -113,34 +75,86 @@ def main():
|
|
| 113 |
return literal_eval(languages)
|
| 114 |
|
| 115 |
def language_count(row):
|
| 116 |
-
|
| 117 |
-
leng = len(languages)
|
| 118 |
-
return leng
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
langs = langs[langs != {}]
|
| 128 |
-
total_langs = len(langs.unique())
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
| 134 |
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
with col1:
|
| 137 |
-
v = total_samples-no_lang_count
|
| 138 |
-
v_old = total_samples_old-no_lang_count_old
|
| 139 |
st.metric(label="Language Specified", value=v, delta=int(v-v_old))
|
| 140 |
with col2:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
st.metric(label="No Language Specified", value=no_lang_count, delta=int(no_lang_count-no_lang_count_old))
|
| 142 |
-
with
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
st.metric(label="Total Unique Languages", value=total_langs, delta=int(total_langs-total_langs_old))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
st.subheader("Count of languages per model repo")
|
| 146 |
st.text("Some repos are for multiple languages, so the count is greater than 1")
|
|
@@ -148,16 +162,21 @@ def main():
|
|
| 148 |
'All or just Multilingual',
|
| 149 |
["All", "Just Multilingual", "Three or more languages"])
|
| 150 |
|
| 151 |
-
filter = 0
|
| 152 |
-
st.text("Tofix: This just takes into account count of languages, it misses the multilingual tag")
|
| 153 |
-
if linguality == "Just Multilingual":
|
| 154 |
-
filter = 1
|
| 155 |
-
elif linguality == "Three or more languages":
|
| 156 |
-
filter = 2
|
| 157 |
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
df1 = models_with_langs['language_count'].value_counts()
|
| 160 |
-
models_with_langs_old = old_data[old_data["language_count"] > filter]
|
| 161 |
df1_old = models_with_langs_old['language_count'].value_counts()
|
| 162 |
st.bar_chart(df1)
|
| 163 |
|
|
@@ -174,13 +193,13 @@ def main():
|
|
| 174 |
else:
|
| 175 |
filter = 2
|
| 176 |
|
| 177 |
-
models_with_langs =
|
| 178 |
langs = models_with_langs["languages"].explode()
|
| 179 |
langs = langs[langs != {}]
|
| 180 |
orig_d = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
| 181 |
d = orig_d
|
| 182 |
|
| 183 |
-
models_with_langs_old =
|
| 184 |
langs = models_with_langs_old["languages"].explode()
|
| 185 |
langs = langs[langs != {}]
|
| 186 |
orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
|
@@ -212,6 +231,8 @@ def main():
|
|
| 212 |
final_data = pd.merge(
|
| 213 |
d, orig_d_old, how="outer", on="language"
|
| 214 |
)
|
|
|
|
|
|
|
| 215 |
final_data["diff"] = final_data["counts"].astype(int) - final_data["old_c"].astype(int)
|
| 216 |
|
| 217 |
st.dataframe(final_data)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
|
|
|
| 3 |
from ast import literal_eval
|
| 4 |
import altair as alt
|
|
|
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
|
| 7 |
+
from utils import process_dataset, eval_tags
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
def main():
|
| 10 |
+
# Pick revision at top
|
| 11 |
supported_revisions = ["24_10_22", "17_10_22", "10_10_22", "27_09_22"]
|
| 12 |
+
col1, col2, col3 = st.columns(3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
with col1:
|
| 14 |
+
new = st.selectbox(
|
| 15 |
+
'Last revision',
|
| 16 |
+
supported_revisions,
|
| 17 |
+
index=0)
|
| 18 |
+
with col2:
|
| 19 |
base = st.selectbox(
|
| 20 |
'Old revision',
|
| 21 |
supported_revisions,
|
| 22 |
index=1)
|
| 23 |
+
with col3:
|
| 24 |
+
base_old = st.selectbox(
|
| 25 |
+
'Very old revision',
|
| 26 |
supported_revisions,
|
| 27 |
+
index=2)
|
| 28 |
+
|
| 29 |
+
def change_pct(old, new):
|
| 30 |
+
return round(100* (new - old) / new, 3)
|
| 31 |
+
|
| 32 |
+
def change_and_delta(old_old, old, new):
|
| 33 |
+
curr_change = change_pct(old, new)
|
| 34 |
+
prev_change = change_pct(old_old, old)
|
| 35 |
+
delta = f"{curr_change-prev_change}%"
|
| 36 |
+
curr_change = f"{curr_change}%"
|
| 37 |
+
return curr_change, delta
|
| 38 |
+
|
| 39 |
+
# Process dataset
|
| 40 |
+
old_old_data = process_dataset(base_old)
|
| 41 |
old_data = process_dataset(base)
|
| 42 |
data = process_dataset(new)
|
| 43 |
+
old_old_data["tags"] = old_old_data.apply(eval_tags, axis=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
old_data["tags"] = old_data.apply(eval_tags, axis=1)
|
| 45 |
data["tags"] = data.apply(eval_tags, axis=1)
|
| 46 |
|
| 47 |
+
# High level count of models and rate of change
|
| 48 |
+
total_samples_old_old = old_old_data.shape[0]
|
| 49 |
total_samples_old = old_data.shape[0]
|
| 50 |
total_samples = data.shape[0]
|
| 51 |
+
|
| 52 |
+
curr_change, delta = change_and_delta(total_samples_old_old, total_samples_old, total_samples)
|
| 53 |
+
|
| 54 |
+
col1, col2 = st.columns(2)
|
| 55 |
+
with col1:
|
| 56 |
+
st.metric(label="Total models", value=total_samples, delta=total_samples-total_samples_old)
|
| 57 |
+
|
| 58 |
+
with col2:
|
| 59 |
+
st.metric(label="Rate of change", value=curr_change, delta=delta)
|
| 60 |
|
| 61 |
# Tabs don't work in Spaces st version
|
| 62 |
#tab1, tab2, tab3, tab4, tab5, tab6, tab7, tab8 = st.tabs(["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super users", "Raw Data"])
|
|
|
|
| 65 |
'Topic of interest',
|
| 66 |
["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super Users", "Raw Data"])
|
| 67 |
|
|
|
|
| 68 |
if tab == "Language":
|
| 69 |
st.header("Languages info")
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
def make_list(row):
|
| 72 |
languages = row["languages"]
|
| 73 |
if languages == "none":
|
|
|
|
| 75 |
return literal_eval(languages)
|
| 76 |
|
| 77 |
def language_count(row):
|
| 78 |
+
return len(row["languages"])
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
def process_for_lang(data):
|
| 81 |
+
# Remove rows without languages
|
| 82 |
+
data.loc[data.languages == "False", 'languages'] = None
|
| 83 |
+
data.loc[data.languages == {}, 'languages'] = None
|
| 84 |
|
| 85 |
+
# Count of rows that have no languages
|
| 86 |
+
no_lang_count = data["languages"].isna().sum()
|
|
|
|
|
|
|
| 87 |
|
| 88 |
+
# As the languages column might have multiple languages,
|
| 89 |
+
# we need to convert it to a list. We then count the number of languages.
|
| 90 |
+
data["languages"] = data["languages"].fillna('none')
|
| 91 |
+
data["languages"] = data.apply(make_list, axis=1)
|
| 92 |
+
data["language_count"] = data.apply(language_count, axis=1)
|
| 93 |
|
| 94 |
+
# Just keep the models with at least one language
|
| 95 |
+
models_with_langs = data[data["language_count"] > 0]
|
| 96 |
+
langs = models_with_langs["languages"].explode()
|
| 97 |
+
langs = langs[langs != {}]
|
| 98 |
+
total_langs = len(langs.unique())
|
| 99 |
+
|
| 100 |
+
data['multilingual'] = data.apply(lambda x: int("multilingual" in x['languages']), axis=1)
|
| 101 |
+
|
| 102 |
+
return data, no_lang_count, total_langs, langs.unique()
|
| 103 |
+
|
| 104 |
+
filtered_data = data.copy()
|
| 105 |
+
old_filtered_data = old_data.copy()
|
| 106 |
+
old_old_filtered_data = old_old_data.copy()
|
| 107 |
+
|
| 108 |
+
modality = st.selectbox(
|
| 109 |
+
'Modalities',
|
| 110 |
+
["All", "NLP", "Audio", "Multimodal"])
|
| 111 |
+
|
| 112 |
+
if modality == "NLP":
|
| 113 |
+
filtered_data = filtered_data[filtered_data["modality"] == "nlp"]
|
| 114 |
+
old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "nlp"]
|
| 115 |
+
old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "nlp"]
|
| 116 |
+
elif modality == "Audio":
|
| 117 |
+
filtered_data = filtered_data[filtered_data["modality"] == "audio"]
|
| 118 |
+
old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "audio"]
|
| 119 |
+
old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "audio"]
|
| 120 |
+
elif modality == "Multimodal":
|
| 121 |
+
filtered_data = filtered_data[filtered_data["modality"] == "multimodal"]
|
| 122 |
+
old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "multimodal"]
|
| 123 |
+
old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "multimodal"]
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
filtered_data, no_lang_count, total_langs, langs = process_for_lang(filtered_data)
|
| 127 |
+
old_filtered_data, no_lang_count_old, total_langs_old, langs_old = process_for_lang(old_filtered_data)
|
| 128 |
+
old_old_filtered_data, no_lang_count_old_old, total_langs_old_old, _ = process_for_lang(old_old_filtered_data)
|
| 129 |
+
|
| 130 |
+
total_samples_filtered = filtered_data.shape[0]
|
| 131 |
+
total_samples_old_filtered = old_filtered_data.shape[0]
|
| 132 |
+
total_samples_old_old_filtered = old_old_filtered_data.shape[0]
|
| 133 |
+
v = total_samples_filtered-no_lang_count
|
| 134 |
+
v_old = total_samples_old_filtered-no_lang_count_old
|
| 135 |
+
v_old_old = total_samples_old_old_filtered-no_lang_count_old_old
|
| 136 |
+
|
| 137 |
+
col1, col2 = st.columns(2)
|
| 138 |
with col1:
|
|
|
|
|
|
|
| 139 |
st.metric(label="Language Specified", value=v, delta=int(v-v_old))
|
| 140 |
with col2:
|
| 141 |
+
curr_change, delta = change_and_delta(v_old_old, v_old, v)
|
| 142 |
+
st.metric(label="Language Specified Rate of Change", value=curr_change, delta=delta)
|
| 143 |
+
|
| 144 |
+
col1, col2 = st.columns(2)
|
| 145 |
+
with col1:
|
| 146 |
st.metric(label="No Language Specified", value=no_lang_count, delta=int(no_lang_count-no_lang_count_old))
|
| 147 |
+
with col2:
|
| 148 |
+
curr_change, delta = change_and_delta(no_lang_count_old_old, no_lang_count_old, no_lang_count)
|
| 149 |
+
st.metric(label="No Language Specified Rate of Change", value=curr_change, delta=delta)
|
| 150 |
+
|
| 151 |
+
col1, col2 = st.columns(2)
|
| 152 |
+
with col1:
|
| 153 |
st.metric(label="Total Unique Languages", value=total_langs, delta=int(total_langs-total_langs_old))
|
| 154 |
+
with col2:
|
| 155 |
+
curr_change, delta = change_and_delta(total_langs_old_old, total_langs_old, total_langs)
|
| 156 |
+
st.metric(label="Total Unique Languages Rate of Change", value=curr_change, delta=delta)
|
| 157 |
+
st.text(f"New languages {set(langs)-set(langs_old)}")
|
| 158 |
|
| 159 |
st.subheader("Count of languages per model repo")
|
| 160 |
st.text("Some repos are for multiple languages, so the count is greater than 1")
|
|
|
|
| 162 |
'All or just Multilingual',
|
| 163 |
["All", "Just Multilingual", "Three or more languages"])
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
+
def filter_multilinguality(data):
|
| 167 |
+
if linguality == "Just Multilingual":
|
| 168 |
+
multilingual_tag = data["multilingual"] == 1
|
| 169 |
+
multiple_lang_tags = data["language_count"] > 1
|
| 170 |
+
return data[multilingual_tag | multiple_lang_tags]
|
| 171 |
+
elif linguality == "Three or more languages":
|
| 172 |
+
return data[data["language_count"] >= 3]
|
| 173 |
+
else:
|
| 174 |
+
return data
|
| 175 |
+
|
| 176 |
+
models_with_langs = filter_multilinguality(filtered_data)
|
| 177 |
+
models_with_langs_old = filter_multilinguality(old_filtered_data)
|
| 178 |
+
|
| 179 |
df1 = models_with_langs['language_count'].value_counts()
|
|
|
|
| 180 |
df1_old = models_with_langs_old['language_count'].value_counts()
|
| 181 |
st.bar_chart(df1)
|
| 182 |
|
|
|
|
| 193 |
else:
|
| 194 |
filter = 2
|
| 195 |
|
| 196 |
+
models_with_langs = filtered_data[filtered_data["language_count"] > 0]
|
| 197 |
langs = models_with_langs["languages"].explode()
|
| 198 |
langs = langs[langs != {}]
|
| 199 |
orig_d = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
| 200 |
d = orig_d
|
| 201 |
|
| 202 |
+
models_with_langs_old = old_filtered_data[old_filtered_data["language_count"] > 0]
|
| 203 |
langs = models_with_langs_old["languages"].explode()
|
| 204 |
langs = langs[langs != {}]
|
| 205 |
orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
|
|
|
| 231 |
final_data = pd.merge(
|
| 232 |
d, orig_d_old, how="outer", on="language"
|
| 233 |
)
|
| 234 |
+
print(final_data["counts"].isna().sum())
|
| 235 |
+
print(final_data["old_c"].isna().sum())
|
| 236 |
final_data["diff"] = final_data["counts"].astype(int) - final_data["old_c"].astype(int)
|
| 237 |
|
| 238 |
st.dataframe(final_data)
|
utils.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import load_dataset
|
| 2 |
+
import streamlit as st
|
| 3 |
+
from ast import literal_eval
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Hugging Face pipeline tags grouped by coarse modality. A pipeline tag is
# expected to appear in exactly one list; anything unlisted is handled by
# modality() below.
nlp_tasks = ["text-classification", "text-generation", "text2text-generation", "token-classification", "fill-mask", "question-answering",
             "translation", "conversational", "sentence-similarity", "summarization", "multiple-choice", "zero-shot-classification", "table-question-answering"
             ]
audio_tasks = ["automatic-speech-recognition", "audio-classification", "text-to-speech", "audio-to-audio", "voice-activity-detection"]
cv_tasks = ["image-classification", "image-segmentation", "zero-shot-image-classification", "image-to-image", "unconditional-image-generation", "object-detection"]
multimodal = ["feature-extraction", "text-to-image", "visual-question-answering", "image-to-text", "document-question-answering"]
tabular = ["tabular-classification", "tabular-regression"]

modalities = {
    "nlp": nlp_tasks,
    "audio": audio_tasks,
    "cv": cv_tasks,
    "multimodal": multimodal,
    "tabular": tabular,
    "rl": ["reinforcement-learning"]
}

def modality(row):
    """Return the modality bucket ("nlp", "audio", ...) for a dataframe row.

    Looks up row["pipeline"] in the task lists above. Returns
    "unk_modality" for a string pipeline tag not found in any list, and
    None for a missing/non-string pipeline value (e.g. NaN from pandas).
    """
    pipeline = row["pipeline"]
    for modality_name, tasks in modalities.items():
        if pipeline in tasks:
            return modality_name
    # BUG FIX: the original wrote `type(pipeline) == "str"`, comparing a
    # type object against the literal string "str" — always False, so the
    # "unk_modality" branch was unreachable. isinstance is the correct check.
    if isinstance(pipeline, str):
        return "unk_modality"
    return None
|
| 32 |
+
|
| 33 |
+
# BUG FIX: the original invoked `st.cache(allow_output_mutation=True)` as a
# bare statement instead of applying it as a decorator, so the result was
# discarded and process_dataset was never cached.
@st.cache(allow_output_mutation=True)
def process_dataset(version):
    """Load the model-repos-stats dataset at revision `version` and enrich it.

    Returns the "train" split as a pandas DataFrame with two derived columns:
    - "modality": coarse modality bucket derived from the pipeline tag
    - "length_bins": model-card text length bucketed into fixed intervals
    """
    # Load dataset at specified revision
    dataset = load_dataset("open-source-metrics/model-repos-stats", revision=version)

    # Convert to pandas dataframe
    data = dataset["train"].to_pandas()

    # Add modality column
    data["modality"] = data.apply(modality, axis=1)

    # Bin the model card length into some bins
    data["length_bins"] = pd.cut(data["text_length"], [0, 200, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000])

    return data
|
| 48 |
+
|
| 49 |
+
def eval_tags(row):
    """Parse the raw "tags" cell of a row into a Python list.

    The dataset stores tags in several shapes: sentinel values meaning
    "no tags" ("none", "{}", an empty list), a stringified list like
    "['pytorch', 'bert']", or a single bare tag like "pytorch". A bare
    tag is wrapped into a one-element list before parsing; a parsed dict
    is treated as "no tags".
    """
    raw = row["tags"]
    # Sentinels that mean no tags at all.
    if raw in ("none", "{}") or raw == []:
        return []
    # Bare tag such as "pytorch" -> "['pytorch']" so literal_eval succeeds.
    if raw[0] != "[":
        raw = str([raw])
    parsed = literal_eval(raw)
    return [] if isinstance(parsed, dict) else parsed
|
| 59 |
+
|
| 60 |
+
def change_pct(old, new):
    """Percentage change from `old` to `new`, rounded to 3 decimals.

    NOTE(review): the denominator is `new` (not `old`), matching the
    original implementation; a zero `new` raises ZeroDivisionError.
    """
    growth = new - old
    return round(100 * growth / new, 3)
|
| 62 |
+
|
| 63 |
+
def change_and_delta(old_old, old, new):
    """Return (current change, delta vs. previous change) as "%" strings.

    The current change is the percentage change old -> new; the delta is
    how much that differs from the previous period's change
    old_old -> old. Both values are rounded to 3 decimal places.
    """
    previous = change_pct(old_old, old)
    current = change_pct(old, new)
    diff = round(current - previous, 3)
    return f"{current}%", f"{diff}%"
|