I tried optimizing the scraping code but found that the result was worse, so this commit reverts to the old code.
Browse files
app.py
CHANGED
@@ -9,7 +9,7 @@ import gradio as gr
|
|
9 |
|
10 |
api = HfApi()
|
11 |
|
12 |
-
def
|
13 |
all_list = []
|
14 |
if which_one == "models":
|
15 |
things = api.list_models(author=org_name)
|
@@ -73,49 +73,15 @@ def get_ranking(model_list, target_org):
|
|
73 |
return [index+1, model]
|
74 |
return "Not Found"
|
75 |
|
76 |
-
|
77 |
-
def get_models(which_one):
|
78 |
-
if which_one == "models":
|
79 |
-
data = api.list_models()
|
80 |
-
elif which_one == "datasets":
|
81 |
-
data = api.list_datasets()
|
82 |
-
elif which_one == "spaces":
|
83 |
-
data = api.list_spaces()
|
84 |
-
|
85 |
-
all_list = []
|
86 |
-
for i in tqdm(data, desc=f"Scraping {which_one}", position=0, leave=True):
|
87 |
-
i = i.__dict__
|
88 |
-
|
89 |
-
id = i["id"].split("/")
|
90 |
-
if len(id) != 1:
|
91 |
-
json_format_data = {"author": id[0] ,"id": "/".join(id), "downloads": i['downloads'], "likes": i['likes']} if which_one != "spaces" else {"author": id[0] ,"id": "/".join(id), "downloads": 0, "likes": i['likes']}
|
92 |
-
|
93 |
-
|
94 |
-
all_list.append(json_format_data)
|
95 |
-
return all_list
|
96 |
-
|
97 |
-
|
98 |
-
def search(data, author_name):
|
99 |
-
matching_authors = []
|
100 |
-
for entry in data:
|
101 |
-
if entry['author'] == author_name:
|
102 |
-
matching_authors.append(entry)
|
103 |
-
|
104 |
-
data_frame = pd.DataFrame(matching_authors)
|
105 |
-
return data_frame
|
106 |
-
|
107 |
-
|
108 |
-
def make_leaderboard(orgs, which_one, data):
|
109 |
data_rows = []
|
110 |
open_llm_leaderboard = get_openllm_leaderboard() if which_one == "models" else None
|
111 |
|
112 |
trend = get_trending_list(1, which_one)
|
113 |
|
114 |
-
for org in tqdm(orgs, desc=f"
|
115 |
rank = get_ranking_trend(trend, org)
|
116 |
-
|
117 |
-
df = search(data, org)
|
118 |
-
|
119 |
if len(df) == 0:
|
120 |
continue
|
121 |
num_things = len(df)
|
@@ -178,6 +144,8 @@ def make_leaderboard(orgs, which_one, data):
|
|
178 |
leaderboard.insert(0, "Serial Number", range(1, len(leaderboard) + 1))
|
179 |
return leaderboard
|
180 |
|
|
|
|
|
181 |
|
182 |
with open("org_names.txt", "r") as f:
|
183 |
org_names_in_list = [i.rstrip("\n") for i in f.readlines()]
|
@@ -185,23 +153,14 @@ with open("org_names.txt", "r") as f:
|
|
185 |
|
186 |
INTRODUCTION_TEXT = f"""
|
187 |
🎯 The Organization Leaderboard aims to track organization rankings. This space is inspired by the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
|
188 |
-
|
189 |
## Available Dataframes:
|
190 |
-
|
191 |
- 🏛️ Models
|
192 |
-
|
193 |
- 📊 Datasets
|
194 |
-
|
195 |
- 🚀 Spaces
|
196 |
-
|
197 |
## Backend
|
198 |
-
|
199 |
🛠️ The leaderboard's backend mainly runs on the [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).
|
200 |
-
|
201 |
🛠️ Organization names are retrieved using web scraping from [Huggingface Organizations](https://huggingface.co/organizations).
|
202 |
-
|
203 |
**🌐 Note:** In the model's dataframe, there are some columns related to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). This data is also retrieved through web scraping.
|
204 |
-
|
205 |
**🌐 Note:** In trending models, first 300 models/datasets/spaces is being retrieved from huggingface.
|
206 |
"""
|
207 |
|
@@ -258,14 +217,10 @@ with gr.Blocks() as demo:
|
|
258 |
gr.Markdown("""<h1 align="center" id="space-title">🤗 Organization Leaderboard</h1>""")
|
259 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
260 |
|
261 |
-
all_models = get_models("models")
|
262 |
-
all_datasets = get_models("datasets")
|
263 |
-
all_spaces = get_models("spaces")
|
264 |
-
|
265 |
-
|
266 |
with gr.TabItem("🏛️ Models", id=1):
|
|
|
267 |
columns_to_convert = ["Organization Name", "Best Model On Open LLM Leaderboard", "Most Downloaded Model", "Most Liked Model", "Trending Model"]
|
268 |
-
models_df = make_leaderboard(org_names_in_list, "models"
|
269 |
models_df = models_df_to_clickable(models_df, columns_to_convert, "models")
|
270 |
|
271 |
headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "🤖 Number of Models", "🏆 Best Model On Open LLM Leaderboard", "🥇 Best Rank On Open LLM Leaderboard", "📊 Average Downloads per Model", "📈 Average Likes per Model", "🚀 Most Downloaded Model", "📈 Most Download Count", "❤️ Most Liked Model", "👍 Most Like Count", "🔥 Trending Model", "👑 Best Rank at Trending Models"]
|
@@ -273,7 +228,7 @@ with gr.Blocks() as demo:
|
|
273 |
|
274 |
with gr.TabItem("📊 Datasets", id=2):
|
275 |
columns_to_convert = ["Organization Name", "Most Downloaded Dataset", "Most Liked Dataset", "Trending Dataset"]
|
276 |
-
dataset_df = make_leaderboard(org_names_in_list, "datasets"
|
277 |
dataset_df = models_df_to_clickable(dataset_df, columns_to_convert, "datasets")
|
278 |
|
279 |
headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "📊 Number of Datasets", "📊 Average Downloads per Dataset", "📈 Average Likes per Dataset", "🚀 Most Downloaded Dataset", "📈 Most Download Count", "❤️ Most Liked Dataset", "👍 Most Like Count", "🔥 Trending Dataset", "👑 Best Rank at Trending Datasets"]
|
@@ -282,10 +237,11 @@ with gr.Blocks() as demo:
|
|
282 |
with gr.TabItem("🚀 Spaces", id=3):
|
283 |
columns_to_convert = ["Organization Name", "Most Liked Space", "Trending Space"]
|
284 |
|
285 |
-
spaces_df = make_leaderboard(org_names_in_list, "spaces"
|
286 |
spaces_df = models_df_to_clickable(spaces_df, columns_to_convert, "spaces")
|
287 |
|
288 |
headers = ["🔢 Serial Number", "🏢 Organization Name", "👍 Total Likes", "🚀 Number of Spaces", "📈 Average Likes per Space", "❤️ Most Liked Space", "👍 Most Like Count", "🔥 Trending Space", "👑 Best Rank at Trending Spaces"]
|
289 |
gr.Dataframe(spaces_df.head(150), headers=headers, interactive=False, datatype=["str", "markdown", "str", "str", "str", "markdown", "str", "markdown", "str"])
|
290 |
|
291 |
demo.launch()
|
|
|
|
9 |
|
10 |
api = HfApi()
|
11 |
|
12 |
+
def get_models(org_name, which_one):
|
13 |
all_list = []
|
14 |
if which_one == "models":
|
15 |
things = api.list_models(author=org_name)
|
|
|
73 |
return [index+1, model]
|
74 |
return "Not Found"
|
75 |
|
76 |
+
def make_leaderboard(orgs, which_one):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
data_rows = []
|
78 |
open_llm_leaderboard = get_openllm_leaderboard() if which_one == "models" else None
|
79 |
|
80 |
trend = get_trending_list(1, which_one)
|
81 |
|
82 |
+
for org in tqdm(orgs, desc=f"Scraping Organizations ({which_one})", position=0, leave=True):
|
83 |
rank = get_ranking_trend(trend, org)
|
84 |
+
df = get_models(org, which_one)
|
|
|
|
|
85 |
if len(df) == 0:
|
86 |
continue
|
87 |
num_things = len(df)
|
|
|
144 |
leaderboard.insert(0, "Serial Number", range(1, len(leaderboard) + 1))
|
145 |
return leaderboard
|
146 |
|
147 |
+
"""# Gradio başlasın
|
148 |
+
"""
|
149 |
|
150 |
with open("org_names.txt", "r") as f:
|
151 |
org_names_in_list = [i.rstrip("\n") for i in f.readlines()]
|
|
|
153 |
|
154 |
INTRODUCTION_TEXT = f"""
|
155 |
🎯 The Organization Leaderboard aims to track organization rankings. This space is inspired by the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
|
|
|
156 |
## Available Dataframes:
|
|
|
157 |
- 🏛️ Models
|
|
|
158 |
- 📊 Datasets
|
|
|
159 |
- 🚀 Spaces
|
|
|
160 |
## Backend
|
|
|
161 |
🛠️ The leaderboard's backend mainly runs on the [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).
|
|
|
162 |
🛠️ Organization names are retrieved using web scraping from [Huggingface Organizations](https://huggingface.co/organizations).
|
|
|
163 |
**🌐 Note:** In the model's dataframe, there are some columns related to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). This data is also retrieved through web scraping.
|
|
|
164 |
**🌐 Note:** In trending models, first 300 models/datasets/spaces is being retrieved from huggingface.
|
165 |
"""
|
166 |
|
|
|
217 |
gr.Markdown("""<h1 align="center" id="space-title">🤗 Organization Leaderboard</h1>""")
|
218 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
219 |
|
|
|
|
|
|
|
|
|
|
|
220 |
with gr.TabItem("🏛️ Models", id=1):
|
221 |
+
|
222 |
columns_to_convert = ["Organization Name", "Best Model On Open LLM Leaderboard", "Most Downloaded Model", "Most Liked Model", "Trending Model"]
|
223 |
+
models_df = make_leaderboard(org_names_in_list, "models")
|
224 |
models_df = models_df_to_clickable(models_df, columns_to_convert, "models")
|
225 |
|
226 |
headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "🤖 Number of Models", "🏆 Best Model On Open LLM Leaderboard", "🥇 Best Rank On Open LLM Leaderboard", "📊 Average Downloads per Model", "📈 Average Likes per Model", "🚀 Most Downloaded Model", "📈 Most Download Count", "❤️ Most Liked Model", "👍 Most Like Count", "🔥 Trending Model", "👑 Best Rank at Trending Models"]
|
|
|
228 |
|
229 |
with gr.TabItem("📊 Datasets", id=2):
|
230 |
columns_to_convert = ["Organization Name", "Most Downloaded Dataset", "Most Liked Dataset", "Trending Dataset"]
|
231 |
+
dataset_df = make_leaderboard(org_names_in_list, "datasets")
|
232 |
dataset_df = models_df_to_clickable(dataset_df, columns_to_convert, "datasets")
|
233 |
|
234 |
headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "📊 Number of Datasets", "📊 Average Downloads per Dataset", "📈 Average Likes per Dataset", "🚀 Most Downloaded Dataset", "📈 Most Download Count", "❤️ Most Liked Dataset", "👍 Most Like Count", "🔥 Trending Dataset", "👑 Best Rank at Trending Datasets"]
|
|
|
237 |
with gr.TabItem("🚀 Spaces", id=3):
|
238 |
columns_to_convert = ["Organization Name", "Most Liked Space", "Trending Space"]
|
239 |
|
240 |
+
spaces_df = make_leaderboard(org_names_in_list, "spaces")
|
241 |
spaces_df = models_df_to_clickable(spaces_df, columns_to_convert, "spaces")
|
242 |
|
243 |
headers = ["🔢 Serial Number", "🏢 Organization Name", "👍 Total Likes", "🚀 Number of Spaces", "📈 Average Likes per Space", "❤️ Most Liked Space", "👍 Most Like Count", "🔥 Trending Space", "👑 Best Rank at Trending Spaces"]
|
244 |
gr.Dataframe(spaces_df.head(150), headers=headers, interactive=False, datatype=["str", "markdown", "str", "str", "str", "markdown", "str", "markdown", "str"])
|
245 |
|
246 |
demo.launch()
|
247 |
+
|