I tried optimizing the scraping code but realized the result was worse, so this commit goes back to the old code.
Browse files
app.py
CHANGED
|
@@ -9,7 +9,7 @@ import gradio as gr
|
|
| 9 |
|
| 10 |
api = HfApi()
|
| 11 |
|
| 12 |
-
def
|
| 13 |
all_list = []
|
| 14 |
if which_one == "models":
|
| 15 |
things = api.list_models(author=org_name)
|
|
@@ -73,49 +73,15 @@ def get_ranking(model_list, target_org):
|
|
| 73 |
return [index+1, model]
|
| 74 |
return "Not Found"
|
| 75 |
|
| 76 |
-
|
| 77 |
-
def get_models(which_one):
|
| 78 |
-
if which_one == "models":
|
| 79 |
-
data = api.list_models()
|
| 80 |
-
elif which_one == "datasets":
|
| 81 |
-
data = api.list_datasets()
|
| 82 |
-
elif which_one == "spaces":
|
| 83 |
-
data = api.list_spaces()
|
| 84 |
-
|
| 85 |
-
all_list = []
|
| 86 |
-
for i in tqdm(data, desc=f"Scraping {which_one}", position=0, leave=True):
|
| 87 |
-
i = i.__dict__
|
| 88 |
-
|
| 89 |
-
id = i["id"].split("/")
|
| 90 |
-
if len(id) != 1:
|
| 91 |
-
json_format_data = {"author": id[0] ,"id": "/".join(id), "downloads": i['downloads'], "likes": i['likes']} if which_one != "spaces" else {"author": id[0] ,"id": "/".join(id), "downloads": 0, "likes": i['likes']}
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
all_list.append(json_format_data)
|
| 95 |
-
return all_list
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
def search(data, author_name):
|
| 99 |
-
matching_authors = []
|
| 100 |
-
for entry in data:
|
| 101 |
-
if entry['author'] == author_name:
|
| 102 |
-
matching_authors.append(entry)
|
| 103 |
-
|
| 104 |
-
data_frame = pd.DataFrame(matching_authors)
|
| 105 |
-
return data_frame
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
def make_leaderboard(orgs, which_one, data):
|
| 109 |
data_rows = []
|
| 110 |
open_llm_leaderboard = get_openllm_leaderboard() if which_one == "models" else None
|
| 111 |
|
| 112 |
trend = get_trending_list(1, which_one)
|
| 113 |
|
| 114 |
-
for org in tqdm(orgs, desc=f"
|
| 115 |
rank = get_ranking_trend(trend, org)
|
| 116 |
-
|
| 117 |
-
df = search(data, org)
|
| 118 |
-
|
| 119 |
if len(df) == 0:
|
| 120 |
continue
|
| 121 |
num_things = len(df)
|
|
@@ -178,6 +144,8 @@ def make_leaderboard(orgs, which_one, data):
|
|
| 178 |
leaderboard.insert(0, "Serial Number", range(1, len(leaderboard) + 1))
|
| 179 |
return leaderboard
|
| 180 |
|
|
|
|
|
|
|
| 181 |
|
| 182 |
with open("org_names.txt", "r") as f:
|
| 183 |
org_names_in_list = [i.rstrip("\n") for i in f.readlines()]
|
|
@@ -185,23 +153,14 @@ with open("org_names.txt", "r") as f:
|
|
| 185 |
|
| 186 |
INTRODUCTION_TEXT = f"""
|
| 187 |
🎯 The Organization Leaderboard aims to track organization rankings. This space is inspired by the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
|
| 188 |
-
|
| 189 |
## Available Dataframes:
|
| 190 |
-
|
| 191 |
- 🏛️ Models
|
| 192 |
-
|
| 193 |
- 📊 Datasets
|
| 194 |
-
|
| 195 |
- 🚀 Spaces
|
| 196 |
-
|
| 197 |
## Backend
|
| 198 |
-
|
| 199 |
🛠️ The leaderboard's backend mainly runs on the [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).
|
| 200 |
-
|
| 201 |
🛠️ Organization names are retrieved using web scraping from [Huggingface Organizations](https://huggingface.co/organizations).
|
| 202 |
-
|
| 203 |
**🌐 Note:** In the model's dataframe, there are some columns related to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). This data is also retrieved through web scraping.
|
| 204 |
-
|
| 205 |
**🌐 Note:** In trending models, first 300 models/datasets/spaces is being retrieved from huggingface.
|
| 206 |
"""
|
| 207 |
|
|
@@ -258,14 +217,10 @@ with gr.Blocks() as demo:
|
|
| 258 |
gr.Markdown("""<h1 align="center" id="space-title">🤗 Organization Leaderboard</h1>""")
|
| 259 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 260 |
|
| 261 |
-
all_models = get_models("models")
|
| 262 |
-
all_datasets = get_models("datasets")
|
| 263 |
-
all_spaces = get_models("spaces")
|
| 264 |
-
|
| 265 |
-
|
| 266 |
with gr.TabItem("🏛️ Models", id=1):
|
|
|
|
| 267 |
columns_to_convert = ["Organization Name", "Best Model On Open LLM Leaderboard", "Most Downloaded Model", "Most Liked Model", "Trending Model"]
|
| 268 |
-
models_df = make_leaderboard(org_names_in_list, "models", all_models)
|
| 269 |
models_df = models_df_to_clickable(models_df, columns_to_convert, "models")
|
| 270 |
|
| 271 |
headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "🤖 Number of Models", "🏆 Best Model On Open LLM Leaderboard", "🥇 Best Rank On Open LLM Leaderboard", "📊 Average Downloads per Model", "📈 Average Likes per Model", "🚀 Most Downloaded Model", "📈 Most Download Count", "❤️ Most Liked Model", "👍 Most Like Count", "🔥 Trending Model", "👑 Best Rank at Trending Models"]
|
|
@@ -273,7 +228,7 @@ with gr.Blocks() as demo:
|
|
| 273 |
|
| 274 |
with gr.TabItem("📊 Datasets", id=2):
|
| 275 |
columns_to_convert = ["Organization Name", "Most Downloaded Dataset", "Most Liked Dataset", "Trending Dataset"]
|
| 276 |
-
dataset_df = make_leaderboard(org_names_in_list, "datasets", all_datasets)
|
| 277 |
dataset_df = models_df_to_clickable(dataset_df, columns_to_convert, "datasets")
|
| 278 |
|
| 279 |
headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "📊 Number of Datasets", "📊 Average Downloads per Dataset", "📈 Average Likes per Dataset", "🚀 Most Downloaded Dataset", "📈 Most Download Count", "❤️ Most Liked Dataset", "👍 Most Like Count", "🔥 Trending Dataset", "👑 Best Rank at Trending Datasets"]
|
|
@@ -282,10 +237,11 @@ with gr.Blocks() as demo:
|
|
| 282 |
with gr.TabItem("🚀 Spaces", id=3):
|
| 283 |
columns_to_convert = ["Organization Name", "Most Liked Space", "Trending Space"]
|
| 284 |
|
| 285 |
-
spaces_df = make_leaderboard(org_names_in_list, "spaces", all_spaces)
|
| 286 |
spaces_df = models_df_to_clickable(spaces_df, columns_to_convert, "spaces")
|
| 287 |
|
| 288 |
headers = ["🔢 Serial Number", "🏢 Organization Name", "👍 Total Likes", "🚀 Number of Spaces", "📈 Average Likes per Space", "❤️ Most Liked Space", "👍 Most Like Count", "🔥 Trending Space", "👑 Best Rank at Trending Spaces"]
|
| 289 |
gr.Dataframe(spaces_df.head(150), headers=headers, interactive=False, datatype=["str", "markdown", "str", "str", "str", "markdown", "str", "markdown", "str"])
|
| 290 |
|
| 291 |
demo.launch()
|
|
|
|
|
|
| 9 |
|
| 10 |
api = HfApi()
|
| 11 |
|
| 12 |
+
def get_models(org_name, which_one):
|
| 13 |
all_list = []
|
| 14 |
if which_one == "models":
|
| 15 |
things = api.list_models(author=org_name)
|
|
|
|
| 73 |
return [index+1, model]
|
| 74 |
return "Not Found"
|
| 75 |
|
| 76 |
+
def make_leaderboard(orgs, which_one):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
data_rows = []
|
| 78 |
open_llm_leaderboard = get_openllm_leaderboard() if which_one == "models" else None
|
| 79 |
|
| 80 |
trend = get_trending_list(1, which_one)
|
| 81 |
|
| 82 |
+
for org in tqdm(orgs, desc=f"Scraping Organizations ({which_one})", position=0, leave=True):
|
| 83 |
rank = get_ranking_trend(trend, org)
|
| 84 |
+
df = get_models(org, which_one)
|
|
|
|
|
|
|
| 85 |
if len(df) == 0:
|
| 86 |
continue
|
| 87 |
num_things = len(df)
|
|
|
|
| 144 |
leaderboard.insert(0, "Serial Number", range(1, len(leaderboard) + 1))
|
| 145 |
return leaderboard
|
| 146 |
|
| 147 |
+
"""# Gradio başlasın
|
| 148 |
+
"""
|
| 149 |
|
| 150 |
with open("org_names.txt", "r") as f:
|
| 151 |
org_names_in_list = [i.rstrip("\n") for i in f.readlines()]
|
|
|
|
| 153 |
|
| 154 |
INTRODUCTION_TEXT = f"""
|
| 155 |
🎯 The Organization Leaderboard aims to track organization rankings. This space is inspired by the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
|
|
|
|
| 156 |
## Available Dataframes:
|
|
|
|
| 157 |
- 🏛️ Models
|
|
|
|
| 158 |
- 📊 Datasets
|
|
|
|
| 159 |
- 🚀 Spaces
|
|
|
|
| 160 |
## Backend
|
|
|
|
| 161 |
🛠️ The leaderboard's backend mainly runs on the [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).
|
|
|
|
| 162 |
🛠️ Organization names are retrieved using web scraping from [Huggingface Organizations](https://huggingface.co/organizations).
|
|
|
|
| 163 |
**🌐 Note:** In the model's dataframe, there are some columns related to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). This data is also retrieved through web scraping.
|
|
|
|
| 164 |
**🌐 Note:** In trending models, first 300 models/datasets/spaces is being retrieved from huggingface.
|
| 165 |
"""
|
| 166 |
|
|
|
|
| 217 |
gr.Markdown("""<h1 align="center" id="space-title">🤗 Organization Leaderboard</h1>""")
|
| 218 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
| 219 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
with gr.TabItem("🏛️ Models", id=1):
|
| 221 |
+
|
| 222 |
columns_to_convert = ["Organization Name", "Best Model On Open LLM Leaderboard", "Most Downloaded Model", "Most Liked Model", "Trending Model"]
|
| 223 |
+
models_df = make_leaderboard(org_names_in_list, "models")
|
| 224 |
models_df = models_df_to_clickable(models_df, columns_to_convert, "models")
|
| 225 |
|
| 226 |
headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "🤖 Number of Models", "🏆 Best Model On Open LLM Leaderboard", "🥇 Best Rank On Open LLM Leaderboard", "📊 Average Downloads per Model", "📈 Average Likes per Model", "🚀 Most Downloaded Model", "📈 Most Download Count", "❤️ Most Liked Model", "👍 Most Like Count", "🔥 Trending Model", "👑 Best Rank at Trending Models"]
|
|
|
|
| 228 |
|
| 229 |
with gr.TabItem("📊 Datasets", id=2):
|
| 230 |
columns_to_convert = ["Organization Name", "Most Downloaded Dataset", "Most Liked Dataset", "Trending Dataset"]
|
| 231 |
+
dataset_df = make_leaderboard(org_names_in_list, "datasets")
|
| 232 |
dataset_df = models_df_to_clickable(dataset_df, columns_to_convert, "datasets")
|
| 233 |
|
| 234 |
headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "📊 Number of Datasets", "📊 Average Downloads per Dataset", "📈 Average Likes per Dataset", "🚀 Most Downloaded Dataset", "📈 Most Download Count", "❤️ Most Liked Dataset", "👍 Most Like Count", "🔥 Trending Dataset", "👑 Best Rank at Trending Datasets"]
|
|
|
|
| 237 |
with gr.TabItem("🚀 Spaces", id=3):
|
| 238 |
columns_to_convert = ["Organization Name", "Most Liked Space", "Trending Space"]
|
| 239 |
|
| 240 |
+
spaces_df = make_leaderboard(org_names_in_list, "spaces")
|
| 241 |
spaces_df = models_df_to_clickable(spaces_df, columns_to_convert, "spaces")
|
| 242 |
|
| 243 |
headers = ["🔢 Serial Number", "🏢 Organization Name", "👍 Total Likes", "🚀 Number of Spaces", "📈 Average Likes per Space", "❤️ Most Liked Space", "👍 Most Like Count", "🔥 Trending Space", "👑 Best Rank at Trending Spaces"]
|
| 244 |
gr.Dataframe(spaces_df.head(150), headers=headers, interactive=False, datatype=["str", "markdown", "str", "str", "str", "markdown", "str", "markdown", "str"])
|
| 245 |
|
| 246 |
demo.launch()
|
| 247 |
+
|