Commit
·
6c21ae3
1
Parent(s):
33c8677
Add languages
Browse files
- changelog.md +7 -0
- language.py +0 -0
- models.py +126 -105
- utils.py +69 -0
changelog.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
| 1 |
Changelog
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
v0.1
|
| 4 |
- Allow pick comparison version
|
| 5 |
- Show delta in all metrics
|
|
|
|
| 1 |
Changelog
|
| 2 |
|
| 3 |
+
v0.2 - Oct 24
|
| 4 |
+
- Languages
|
| 5 |
+
- Allow filtering for modality
|
| 6 |
+
- Show new languages for the diff
|
| 7 |
+
- Show rate of change in languages
|
| 8 |
+
- Also treat repos carrying the multilingual tag as multilingual when selecting models by language
|
| 9 |
+
|
| 10 |
v0.1
|
| 11 |
- Allow pick comparison version
|
| 12 |
- Show delta in all metrics
|
language.py
ADDED
|
File without changes
|
models.py
CHANGED
|
@@ -1,89 +1,62 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
-
from datasets import load_dataset
|
| 4 |
from ast import literal_eval
|
| 5 |
import altair as alt
|
| 6 |
-
import plotly.graph_objs as go
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
|
| 9 |
-
|
| 10 |
-
print("Build")
|
| 11 |
-
nlp_tasks = ["text-classification", "text-generation", "text2text-generation", "token-classification", "fill-mask", "question-answering",
|
| 12 |
-
"translation", "conversational", "sentence-similarity", "summarization", "multiple-choice", "zero-shot-classification", "table-question-answering"
|
| 13 |
-
]
|
| 14 |
-
audio_tasks = ["automatic-speech-recognition", "audio-classification", "text-to-speech", "audio-to-audio", "voice-activity-detection"]
|
| 15 |
-
cv_tasks = ["image-classification", "image-segmentation", "zero-shot-image-classification", "image-to-image", "unconditional-image-generation", "object-detection"]
|
| 16 |
-
multimodal = ["feature-extraction", "text-to-image", "visual-question-answering", "image-to-text", "document-question-answering"]
|
| 17 |
-
tabular = ["tabular-classification", "tabular-regression"]
|
| 18 |
-
|
| 19 |
-
modalities = {
|
| 20 |
-
"nlp": nlp_tasks,
|
| 21 |
-
"audio": audio_tasks,
|
| 22 |
-
"cv": cv_tasks,
|
| 23 |
-
"multimodal": multimodal,
|
| 24 |
-
"tabular": tabular,
|
| 25 |
-
"rl": ["reinforcement-learning"]
|
| 26 |
-
}
|
| 27 |
-
|
| 28 |
-
def modality(row):
|
| 29 |
-
pipeline = row["pipeline"]
|
| 30 |
-
for modality, tasks in modalities.items():
|
| 31 |
-
if pipeline in tasks:
|
| 32 |
-
return modality
|
| 33 |
-
if type(pipeline) == "str":
|
| 34 |
-
return "unk_modality"
|
| 35 |
-
return None
|
| 36 |
|
|
|
|
|
|
|
| 37 |
supported_revisions = ["24_10_22", "17_10_22", "10_10_22", "27_09_22"]
|
| 38 |
-
|
| 39 |
-
st.cache(allow_output_mutation=True)
|
| 40 |
-
def process_dataset(version):
|
| 41 |
-
# Load dataset at specified revision
|
| 42 |
-
dataset = load_dataset("open-source-metrics/model-repos-stats", revision=version)
|
| 43 |
-
|
| 44 |
-
# Convert to pandas dataframe
|
| 45 |
-
data = dataset["train"].to_pandas()
|
| 46 |
-
|
| 47 |
-
# Add modality column
|
| 48 |
-
data["modality"] = data.apply(modality, axis=1)
|
| 49 |
-
|
| 50 |
-
# Bin the model card length into some bins
|
| 51 |
-
data["length_bins"] = pd.cut(data["text_length"], [0, 200, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000])
|
| 52 |
-
|
| 53 |
-
return data
|
| 54 |
-
|
| 55 |
-
col1, col2 = st.columns(2)
|
| 56 |
with col1:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
base = st.selectbox(
|
| 58 |
'Old revision',
|
| 59 |
supported_revisions,
|
| 60 |
index=1)
|
| 61 |
-
with
|
| 62 |
-
|
| 63 |
-
'
|
| 64 |
supported_revisions,
|
| 65 |
-
index=
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
old_data = process_dataset(base)
|
| 68 |
data = process_dataset(new)
|
| 69 |
-
|
| 70 |
-
def eval_tags(row):
|
| 71 |
-
tags = row["tags"]
|
| 72 |
-
if tags == "none" or tags == [] or tags == "{}":
|
| 73 |
-
return []
|
| 74 |
-
if tags[0] != "[":
|
| 75 |
-
tags = str([tags])
|
| 76 |
-
val = literal_eval(tags)
|
| 77 |
-
if isinstance(val, dict):
|
| 78 |
-
return []
|
| 79 |
-
return val
|
| 80 |
-
|
| 81 |
old_data["tags"] = old_data.apply(eval_tags, axis=1)
|
| 82 |
data["tags"] = data.apply(eval_tags, axis=1)
|
| 83 |
|
|
|
|
|
|
|
| 84 |
total_samples_old = old_data.shape[0]
|
| 85 |
total_samples = data.shape[0]
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
# Tabs don't work in Spaces st version
|
| 89 |
#tab1, tab2, tab3, tab4, tab5, tab6, tab7, tab8 = st.tabs(["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super users", "Raw Data"])
|
|
@@ -92,20 +65,9 @@ def main():
|
|
| 92 |
'Topic of interest',
|
| 93 |
["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super Users", "Raw Data"])
|
| 94 |
|
| 95 |
-
# with tab1:
|
| 96 |
if tab == "Language":
|
| 97 |
st.header("Languages info")
|
| 98 |
|
| 99 |
-
data.loc[data.languages == "False", 'languages'] = None
|
| 100 |
-
data.loc[data.languages == {}, 'languages'] = None
|
| 101 |
-
old_data.loc[old_data.languages == "False", 'languages'] = None
|
| 102 |
-
old_data.loc[old_data.languages == {}, 'languages'] = None
|
| 103 |
-
|
| 104 |
-
no_lang_count = data["languages"].isna().sum()
|
| 105 |
-
no_lang_count_old = old_data["languages"].isna().sum()
|
| 106 |
-
data["languages"] = data["languages"].fillna('none')
|
| 107 |
-
old_data["languages"] = old_data["languages"].fillna('none')
|
| 108 |
-
|
| 109 |
def make_list(row):
|
| 110 |
languages = row["languages"]
|
| 111 |
if languages == "none":
|
|
@@ -113,34 +75,86 @@ def main():
|
|
| 113 |
return literal_eval(languages)
|
| 114 |
|
| 115 |
def language_count(row):
|
| 116 |
-
|
| 117 |
-
leng = len(languages)
|
| 118 |
-
return leng
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
langs = langs[langs != {}]
|
| 128 |
-
total_langs = len(langs.unique())
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
| 134 |
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
with col1:
|
| 137 |
-
v = total_samples-no_lang_count
|
| 138 |
-
v_old = total_samples_old-no_lang_count_old
|
| 139 |
st.metric(label="Language Specified", value=v, delta=int(v-v_old))
|
| 140 |
with col2:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
st.metric(label="No Language Specified", value=no_lang_count, delta=int(no_lang_count-no_lang_count_old))
|
| 142 |
-
with
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
st.metric(label="Total Unique Languages", value=total_langs, delta=int(total_langs-total_langs_old))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
st.subheader("Count of languages per model repo")
|
| 146 |
st.text("Some repos are for multiple languages, so the count is greater than 1")
|
|
@@ -148,16 +162,21 @@ def main():
|
|
| 148 |
'All or just Multilingual',
|
| 149 |
["All", "Just Multilingual", "Three or more languages"])
|
| 150 |
|
| 151 |
-
filter = 0
|
| 152 |
-
st.text("Tofix: This just takes into account count of languages, it misses the multilingual tag")
|
| 153 |
-
if linguality == "Just Multilingual":
|
| 154 |
-
filter = 1
|
| 155 |
-
elif linguality == "Three or more languages":
|
| 156 |
-
filter = 2
|
| 157 |
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
df1 = models_with_langs['language_count'].value_counts()
|
| 160 |
-
models_with_langs_old = old_data[old_data["language_count"] > filter]
|
| 161 |
df1_old = models_with_langs_old['language_count'].value_counts()
|
| 162 |
st.bar_chart(df1)
|
| 163 |
|
|
@@ -174,13 +193,13 @@ def main():
|
|
| 174 |
else:
|
| 175 |
filter = 2
|
| 176 |
|
| 177 |
-
models_with_langs =
|
| 178 |
langs = models_with_langs["languages"].explode()
|
| 179 |
langs = langs[langs != {}]
|
| 180 |
orig_d = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
| 181 |
d = orig_d
|
| 182 |
|
| 183 |
-
models_with_langs_old =
|
| 184 |
langs = models_with_langs_old["languages"].explode()
|
| 185 |
langs = langs[langs != {}]
|
| 186 |
orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
|
@@ -212,6 +231,8 @@ def main():
|
|
| 212 |
final_data = pd.merge(
|
| 213 |
d, orig_d_old, how="outer", on="language"
|
| 214 |
)
|
|
|
|
|
|
|
| 215 |
final_data["diff"] = final_data["counts"].astype(int) - final_data["old_c"].astype(int)
|
| 216 |
|
| 217 |
st.dataframe(final_data)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
|
|
|
| 3 |
from ast import literal_eval
|
| 4 |
import altair as alt
|
|
|
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
|
| 7 |
+
from utils import process_dataset, eval_tags
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
def main():
|
| 10 |
+
# Pick revision at top
|
| 11 |
supported_revisions = ["24_10_22", "17_10_22", "10_10_22", "27_09_22"]
|
| 12 |
+
col1, col2, col3 = st.columns(3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
with col1:
|
| 14 |
+
new = st.selectbox(
|
| 15 |
+
'Last revision',
|
| 16 |
+
supported_revisions,
|
| 17 |
+
index=0)
|
| 18 |
+
with col2:
|
| 19 |
base = st.selectbox(
|
| 20 |
'Old revision',
|
| 21 |
supported_revisions,
|
| 22 |
index=1)
|
| 23 |
+
with col3:
|
| 24 |
+
base_old = st.selectbox(
|
| 25 |
+
'Very old revision',
|
| 26 |
supported_revisions,
|
| 27 |
+
index=2)
|
| 28 |
+
|
| 29 |
+
def change_pct(old, new):
|
| 30 |
+
return round(100* (new - old) / new, 3)
|
| 31 |
+
|
| 32 |
+
def change_and_delta(old_old, old, new):
|
| 33 |
+
curr_change = change_pct(old, new)
|
| 34 |
+
prev_change = change_pct(old_old, old)
|
| 35 |
+
delta = f"{curr_change-prev_change}%"
|
| 36 |
+
curr_change = f"{curr_change}%"
|
| 37 |
+
return curr_change, delta
|
| 38 |
+
|
| 39 |
+
# Process dataset
|
| 40 |
+
old_old_data = process_dataset(base_old)
|
| 41 |
old_data = process_dataset(base)
|
| 42 |
data = process_dataset(new)
|
| 43 |
+
old_old_data["tags"] = old_old_data.apply(eval_tags, axis=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
old_data["tags"] = old_data.apply(eval_tags, axis=1)
|
| 45 |
data["tags"] = data.apply(eval_tags, axis=1)
|
| 46 |
|
| 47 |
+
# High level count of models and rate of change
|
| 48 |
+
total_samples_old_old = old_old_data.shape[0]
|
| 49 |
total_samples_old = old_data.shape[0]
|
| 50 |
total_samples = data.shape[0]
|
| 51 |
+
|
| 52 |
+
curr_change, delta = change_and_delta(total_samples_old_old, total_samples_old, total_samples)
|
| 53 |
+
|
| 54 |
+
col1, col2 = st.columns(2)
|
| 55 |
+
with col1:
|
| 56 |
+
st.metric(label="Total models", value=total_samples, delta=total_samples-total_samples_old)
|
| 57 |
+
|
| 58 |
+
with col2:
|
| 59 |
+
st.metric(label="Rate of change", value=curr_change, delta=delta)
|
| 60 |
|
| 61 |
# Tabs don't work in Spaces st version
|
| 62 |
#tab1, tab2, tab3, tab4, tab5, tab6, tab7, tab8 = st.tabs(["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super users", "Raw Data"])
|
|
|
|
| 65 |
'Topic of interest',
|
| 66 |
["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super Users", "Raw Data"])
|
| 67 |
|
|
|
|
| 68 |
if tab == "Language":
|
| 69 |
st.header("Languages info")
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
def make_list(row):
|
| 72 |
languages = row["languages"]
|
| 73 |
if languages == "none":
|
|
|
|
| 75 |
return literal_eval(languages)
|
| 76 |
|
| 77 |
def language_count(row):
|
| 78 |
+
return len(row["languages"])
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
def process_for_lang(data):
|
| 81 |
+
# Remove rows without languages
|
| 82 |
+
data.loc[data.languages == "False", 'languages'] = None
|
| 83 |
+
data.loc[data.languages == {}, 'languages'] = None
|
| 84 |
|
| 85 |
+
# Count of rows that have no languages
|
| 86 |
+
no_lang_count = data["languages"].isna().sum()
|
|
|
|
|
|
|
| 87 |
|
| 88 |
+
# As the languages column might have multiple languages,
|
| 89 |
+
# we need to convert it to a list. We then count the number of languages.
|
| 90 |
+
data["languages"] = data["languages"].fillna('none')
|
| 91 |
+
data["languages"] = data.apply(make_list, axis=1)
|
| 92 |
+
data["language_count"] = data.apply(language_count, axis=1)
|
| 93 |
|
| 94 |
+
# Just keep the models with at least one language
|
| 95 |
+
models_with_langs = data[data["language_count"] > 0]
|
| 96 |
+
langs = models_with_langs["languages"].explode()
|
| 97 |
+
langs = langs[langs != {}]
|
| 98 |
+
total_langs = len(langs.unique())
|
| 99 |
+
|
| 100 |
+
data['multilingual'] = data.apply(lambda x: int("multilingual" in x['languages']), axis=1)
|
| 101 |
+
|
| 102 |
+
return data, no_lang_count, total_langs, langs.unique()
|
| 103 |
+
|
| 104 |
+
filtered_data = data.copy()
|
| 105 |
+
old_filtered_data = old_data.copy()
|
| 106 |
+
old_old_filtered_data = old_old_data.copy()
|
| 107 |
+
|
| 108 |
+
modality = st.selectbox(
|
| 109 |
+
'Modalities',
|
| 110 |
+
["All", "NLP", "Audio", "Multimodal"])
|
| 111 |
+
|
| 112 |
+
if modality == "NLP":
|
| 113 |
+
filtered_data = filtered_data[filtered_data["modality"] == "nlp"]
|
| 114 |
+
old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "nlp"]
|
| 115 |
+
old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "nlp"]
|
| 116 |
+
elif modality == "Audio":
|
| 117 |
+
filtered_data = filtered_data[filtered_data["modality"] == "audio"]
|
| 118 |
+
old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "audio"]
|
| 119 |
+
old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "audio"]
|
| 120 |
+
elif modality == "Multimodal":
|
| 121 |
+
filtered_data = filtered_data[filtered_data["modality"] == "multimodal"]
|
| 122 |
+
old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "multimodal"]
|
| 123 |
+
old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "multimodal"]
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
filtered_data, no_lang_count, total_langs, langs = process_for_lang(filtered_data)
|
| 127 |
+
old_filtered_data, no_lang_count_old, total_langs_old, langs_old = process_for_lang(old_filtered_data)
|
| 128 |
+
old_old_filtered_data, no_lang_count_old_old, total_langs_old_old, _ = process_for_lang(old_old_filtered_data)
|
| 129 |
+
|
| 130 |
+
total_samples_filtered = filtered_data.shape[0]
|
| 131 |
+
total_samples_old_filtered = old_filtered_data.shape[0]
|
| 132 |
+
total_samples_old_old_filtered = old_old_filtered_data.shape[0]
|
| 133 |
+
v = total_samples_filtered-no_lang_count
|
| 134 |
+
v_old = total_samples_old_filtered-no_lang_count_old
|
| 135 |
+
v_old_old = total_samples_old_old_filtered-no_lang_count_old_old
|
| 136 |
+
|
| 137 |
+
col1, col2 = st.columns(2)
|
| 138 |
with col1:
|
|
|
|
|
|
|
| 139 |
st.metric(label="Language Specified", value=v, delta=int(v-v_old))
|
| 140 |
with col2:
|
| 141 |
+
curr_change, delta = change_and_delta(v_old_old, v_old, v)
|
| 142 |
+
st.metric(label="Language Specified Rate of Change", value=curr_change, delta=delta)
|
| 143 |
+
|
| 144 |
+
col1, col2 = st.columns(2)
|
| 145 |
+
with col1:
|
| 146 |
st.metric(label="No Language Specified", value=no_lang_count, delta=int(no_lang_count-no_lang_count_old))
|
| 147 |
+
with col2:
|
| 148 |
+
curr_change, delta = change_and_delta(no_lang_count_old_old, no_lang_count_old, no_lang_count)
|
| 149 |
+
st.metric(label="No Language Specified Rate of Change", value=curr_change, delta=delta)
|
| 150 |
+
|
| 151 |
+
col1, col2 = st.columns(2)
|
| 152 |
+
with col1:
|
| 153 |
st.metric(label="Total Unique Languages", value=total_langs, delta=int(total_langs-total_langs_old))
|
| 154 |
+
with col2:
|
| 155 |
+
curr_change, delta = change_and_delta(total_langs_old_old, total_langs_old, total_langs)
|
| 156 |
+
st.metric(label="Total Unique Languages Rate of Change", value=curr_change, delta=delta)
|
| 157 |
+
st.text(f"New languages {set(langs)-set(langs_old)}")
|
| 158 |
|
| 159 |
st.subheader("Count of languages per model repo")
|
| 160 |
st.text("Some repos are for multiple languages, so the count is greater than 1")
|
|
|
|
| 162 |
'All or just Multilingual',
|
| 163 |
["All", "Just Multilingual", "Three or more languages"])
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
+
def filter_multilinguality(data):
|
| 167 |
+
if linguality == "Just Multilingual":
|
| 168 |
+
multilingual_tag = data["multilingual"] == 1
|
| 169 |
+
multiple_lang_tags = data["language_count"] > 1
|
| 170 |
+
return data[multilingual_tag | multiple_lang_tags]
|
| 171 |
+
elif linguality == "Three or more languages":
|
| 172 |
+
return data[data["language_count"] >= 3]
|
| 173 |
+
else:
|
| 174 |
+
return data
|
| 175 |
+
|
| 176 |
+
models_with_langs = filter_multilinguality(filtered_data)
|
| 177 |
+
models_with_langs_old = filter_multilinguality(old_filtered_data)
|
| 178 |
+
|
| 179 |
df1 = models_with_langs['language_count'].value_counts()
|
|
|
|
| 180 |
df1_old = models_with_langs_old['language_count'].value_counts()
|
| 181 |
st.bar_chart(df1)
|
| 182 |
|
|
|
|
| 193 |
else:
|
| 194 |
filter = 2
|
| 195 |
|
| 196 |
+
models_with_langs = filtered_data[filtered_data["language_count"] > 0]
|
| 197 |
langs = models_with_langs["languages"].explode()
|
| 198 |
langs = langs[langs != {}]
|
| 199 |
orig_d = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
| 200 |
d = orig_d
|
| 201 |
|
| 202 |
+
models_with_langs_old = old_filtered_data[old_filtered_data["language_count"] > 0]
|
| 203 |
langs = models_with_langs_old["languages"].explode()
|
| 204 |
langs = langs[langs != {}]
|
| 205 |
orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
|
|
|
| 231 |
final_data = pd.merge(
|
| 232 |
d, orig_d_old, how="outer", on="language"
|
| 233 |
)
|
| 234 |
+
print(final_data["counts"].isna().sum())
|
| 235 |
+
print(final_data["old_c"].isna().sum())
|
| 236 |
final_data["diff"] = final_data["counts"].astype(int) - final_data["old_c"].astype(int)
|
| 237 |
|
| 238 |
st.dataframe(final_data)
|
utils.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import load_dataset
|
| 2 |
+
import streamlit as st
|
| 3 |
+
from ast import literal_eval
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Hugging Face pipeline tags grouped by coarse modality. A pipeline tag is
# expected to appear in exactly one list; anything unlisted is handled by
# modality() below.
nlp_tasks = ["text-classification", "text-generation", "text2text-generation", "token-classification", "fill-mask", "question-answering",
             "translation", "conversational", "sentence-similarity", "summarization", "multiple-choice", "zero-shot-classification", "table-question-answering"
             ]
audio_tasks = ["automatic-speech-recognition", "audio-classification", "text-to-speech", "audio-to-audio", "voice-activity-detection"]
cv_tasks = ["image-classification", "image-segmentation", "zero-shot-image-classification", "image-to-image", "unconditional-image-generation", "object-detection"]
multimodal = ["feature-extraction", "text-to-image", "visual-question-answering", "image-to-text", "document-question-answering"]
tabular = ["tabular-classification", "tabular-regression"]

modalities = {
    "nlp": nlp_tasks,
    "audio": audio_tasks,
    "cv": cv_tasks,
    "multimodal": multimodal,
    "tabular": tabular,
    "rl": ["reinforcement-learning"]
}

def modality(row):
    """Return the modality bucket ("nlp", "audio", ...) for a dataframe row.

    Looks up row["pipeline"] in the task lists above. Returns
    "unk_modality" for a string pipeline tag not found in any list, and
    None for a missing/non-string pipeline value (e.g. NaN from pandas).
    """
    pipeline = row["pipeline"]
    for modality_name, tasks in modalities.items():
        if pipeline in tasks:
            return modality_name
    # BUG FIX: the original wrote `type(pipeline) == "str"`, comparing a
    # type object against the literal string "str" — always False, so the
    # "unk_modality" branch was unreachable. isinstance is the correct check.
    if isinstance(pipeline, str):
        return "unk_modality"
    return None
|
| 32 |
+
|
| 33 |
+
# BUG FIX: the original invoked `st.cache(allow_output_mutation=True)` as a
# bare statement instead of applying it as a decorator, so the result was
# discarded and process_dataset was never cached.
@st.cache(allow_output_mutation=True)
def process_dataset(version):
    """Load the model-repos-stats dataset at revision `version` and enrich it.

    Returns the "train" split as a pandas DataFrame with two derived columns:
    - "modality": coarse modality bucket derived from the pipeline tag
    - "length_bins": model-card text length bucketed into fixed intervals
    """
    # Load dataset at specified revision
    dataset = load_dataset("open-source-metrics/model-repos-stats", revision=version)

    # Convert to pandas dataframe
    data = dataset["train"].to_pandas()

    # Add modality column
    data["modality"] = data.apply(modality, axis=1)

    # Bin the model card length into some bins
    data["length_bins"] = pd.cut(data["text_length"], [0, 200, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000])

    return data
|
| 48 |
+
|
| 49 |
+
def eval_tags(row):
    """Parse the raw "tags" cell of a row into a Python list.

    The dataset stores tags in several shapes: sentinel values meaning
    "no tags" ("none", "{}", an empty list), a stringified list like
    "['pytorch', 'bert']", or a single bare tag like "pytorch". A bare
    tag is wrapped into a one-element list before parsing; a parsed dict
    is treated as "no tags".
    """
    raw = row["tags"]
    # Sentinels that mean no tags at all.
    if raw in ("none", "{}") or raw == []:
        return []
    # Bare tag such as "pytorch" -> "['pytorch']" so literal_eval succeeds.
    if raw[0] != "[":
        raw = str([raw])
    parsed = literal_eval(raw)
    return [] if isinstance(parsed, dict) else parsed
|
| 59 |
+
|
| 60 |
+
def change_pct(old, new):
    """Percentage change from `old` to `new`, rounded to 3 decimals.

    NOTE(review): the denominator is `new` (not `old`), matching the
    original implementation; a zero `new` raises ZeroDivisionError.
    """
    growth = new - old
    return round(100 * growth / new, 3)
|
| 62 |
+
|
| 63 |
+
def change_and_delta(old_old, old, new):
    """Return (current change, delta vs. previous change) as "%" strings.

    The current change is the percentage change old -> new; the delta is
    how much that differs from the previous period's change
    old_old -> old. Both values are rounded to 3 decimal places.
    """
    previous = change_pct(old_old, old)
    current = change_pct(old, new)
    diff = round(current - previous, 3)
    return f"{current}%", f"{diff}%"
|