Weyaxi committed on
Commit
9df3444
1 Parent(s): 490b824

I tried optimizing the scraping code, but the result turned out worse, so this commit reverts to the old code.

Browse files
Files changed (1) hide show
  1. app.py +11 -55
app.py CHANGED
@@ -9,7 +9,7 @@ import gradio as gr
9
 
10
  api = HfApi()
11
 
12
- def get_models_old(org_name, which_one):
13
  all_list = []
14
  if which_one == "models":
15
  things = api.list_models(author=org_name)
@@ -73,49 +73,15 @@ def get_ranking(model_list, target_org):
73
  return [index+1, model]
74
  return "Not Found"
75
 
76
-
77
- def get_models(which_one):
78
- if which_one == "models":
79
- data = api.list_models()
80
- elif which_one == "datasets":
81
- data = api.list_datasets()
82
- elif which_one == "spaces":
83
- data = api.list_spaces()
84
-
85
- all_list = []
86
- for i in tqdm(data, desc=f"Scraping {which_one}", position=0, leave=True):
87
- i = i.__dict__
88
-
89
- id = i["id"].split("/")
90
- if len(id) != 1:
91
- json_format_data = {"author": id[0] ,"id": "/".join(id), "downloads": i['downloads'], "likes": i['likes']} if which_one != "spaces" else {"author": id[0] ,"id": "/".join(id), "downloads": 0, "likes": i['likes']}
92
-
93
-
94
- all_list.append(json_format_data)
95
- return all_list
96
-
97
-
98
- def search(data, author_name):
99
- matching_authors = []
100
- for entry in data:
101
- if entry['author'] == author_name:
102
- matching_authors.append(entry)
103
-
104
- data_frame = pd.DataFrame(matching_authors)
105
- return data_frame
106
-
107
-
108
- def make_leaderboard(orgs, which_one, data):
109
  data_rows = []
110
  open_llm_leaderboard = get_openllm_leaderboard() if which_one == "models" else None
111
 
112
  trend = get_trending_list(1, which_one)
113
 
114
- for org in tqdm(orgs, desc=f"Proccesing Organizations ({which_one})", position=0, leave=True):
115
  rank = get_ranking_trend(trend, org)
116
-
117
- df = search(data, org)
118
-
119
  if len(df) == 0:
120
  continue
121
  num_things = len(df)
@@ -178,6 +144,8 @@ def make_leaderboard(orgs, which_one, data):
178
  leaderboard.insert(0, "Serial Number", range(1, len(leaderboard) + 1))
179
  return leaderboard
180
 
 
 
181
 
182
  with open("org_names.txt", "r") as f:
183
  org_names_in_list = [i.rstrip("\n") for i in f.readlines()]
@@ -185,23 +153,14 @@ with open("org_names.txt", "r") as f:
185
 
186
  INTRODUCTION_TEXT = f"""
187
  🎯 The Organization Leaderboard aims to track organization rankings. This space is inspired by the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
188
-
189
  ## Available Dataframes:
190
-
191
  - 🏛️ Models
192
-
193
  - 📊 Datasets
194
-
195
  - 🚀 Spaces
196
-
197
  ## Backend
198
-
199
  🛠️ The leaderboard's backend mainly runs on the [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).
200
-
201
  🛠️ Organization names are retrieved using web scraping from [Huggingface Organizations](https://huggingface.co/organizations).
202
-
203
  **🌐 Note:** In the model's dataframe, there are some columns related to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). This data is also retrieved through web scraping.
204
-
205
  **🌐 Note:** In trending models, first 300 models/datasets/spaces is being retrieved from huggingface.
206
  """
207
 
@@ -258,14 +217,10 @@ with gr.Blocks() as demo:
258
  gr.Markdown("""<h1 align="center" id="space-title">🤗 Organization Leaderboard</h1>""")
259
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
260
 
261
- all_models = get_models("models")
262
- all_datasets = get_models("datasets")
263
- all_spaces = get_models("spaces")
264
-
265
-
266
  with gr.TabItem("🏛️ Models", id=1):
 
267
  columns_to_convert = ["Organization Name", "Best Model On Open LLM Leaderboard", "Most Downloaded Model", "Most Liked Model", "Trending Model"]
268
- models_df = make_leaderboard(org_names_in_list, "models", all_models)
269
  models_df = models_df_to_clickable(models_df, columns_to_convert, "models")
270
 
271
  headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "🤖 Number of Models", "🏆 Best Model On Open LLM Leaderboard", "🥇 Best Rank On Open LLM Leaderboard", "📊 Average Downloads per Model", "📈 Average Likes per Model", "🚀 Most Downloaded Model", "📈 Most Download Count", "❤️ Most Liked Model", "👍 Most Like Count", "🔥 Trending Model", "👑 Best Rank at Trending Models"]
@@ -273,7 +228,7 @@ with gr.Blocks() as demo:
273
 
274
  with gr.TabItem("📊 Datasets", id=2):
275
  columns_to_convert = ["Organization Name", "Most Downloaded Dataset", "Most Liked Dataset", "Trending Dataset"]
276
- dataset_df = make_leaderboard(org_names_in_list, "datasets", all_datasets)
277
  dataset_df = models_df_to_clickable(dataset_df, columns_to_convert, "datasets")
278
 
279
  headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "📊 Number of Datasets", "📊 Average Downloads per Dataset", "📈 Average Likes per Dataset", "🚀 Most Downloaded Dataset", "📈 Most Download Count", "❤️ Most Liked Dataset", "👍 Most Like Count", "🔥 Trending Dataset", "👑 Best Rank at Trending Datasets"]
@@ -282,10 +237,11 @@ with gr.Blocks() as demo:
282
  with gr.TabItem("🚀 Spaces", id=3):
283
  columns_to_convert = ["Organization Name", "Most Liked Space", "Trending Space"]
284
 
285
- spaces_df = make_leaderboard(org_names_in_list, "spaces", all_spaces)
286
  spaces_df = models_df_to_clickable(spaces_df, columns_to_convert, "spaces")
287
 
288
  headers = ["🔢 Serial Number", "🏢 Organization Name", "👍 Total Likes", "🚀 Number of Spaces", "📈 Average Likes per Space", "❤️ Most Liked Space", "👍 Most Like Count", "🔥 Trending Space", "👑 Best Rank at Trending Spaces"]
289
  gr.Dataframe(spaces_df.head(150), headers=headers, interactive=False, datatype=["str", "markdown", "str", "str", "str", "markdown", "str", "markdown", "str"])
290
 
291
  demo.launch()
 
 
9
 
10
  api = HfApi()
11
 
12
+ def get_models(org_name, which_one):
13
  all_list = []
14
  if which_one == "models":
15
  things = api.list_models(author=org_name)
 
73
  return [index+1, model]
74
  return "Not Found"
75
 
76
+ def make_leaderboard(orgs, which_one):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  data_rows = []
78
  open_llm_leaderboard = get_openllm_leaderboard() if which_one == "models" else None
79
 
80
  trend = get_trending_list(1, which_one)
81
 
82
+ for org in tqdm(orgs, desc=f"Scraping Organizations ({which_one})", position=0, leave=True):
83
  rank = get_ranking_trend(trend, org)
84
+ df = get_models(org, which_one)
 
 
85
  if len(df) == 0:
86
  continue
87
  num_things = len(df)
 
144
  leaderboard.insert(0, "Serial Number", range(1, len(leaderboard) + 1))
145
  return leaderboard
146
 
147
+ """# Gradio başlasın
148
+ """
149
 
150
  with open("org_names.txt", "r") as f:
151
  org_names_in_list = [i.rstrip("\n") for i in f.readlines()]
 
153
 
154
  INTRODUCTION_TEXT = f"""
155
  🎯 The Organization Leaderboard aims to track organization rankings. This space is inspired by the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
 
156
  ## Available Dataframes:
 
157
  - 🏛️ Models
 
158
  - 📊 Datasets
 
159
  - 🚀 Spaces
 
160
  ## Backend
 
161
  🛠️ The leaderboard's backend mainly runs on the [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/v0.5.1/en/package_reference/hf_api).
 
162
  🛠️ Organization names are retrieved using web scraping from [Huggingface Organizations](https://huggingface.co/organizations).
 
163
  **🌐 Note:** In the model's dataframe, there are some columns related to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). This data is also retrieved through web scraping.
 
164
  **🌐 Note:** In trending models, first 300 models/datasets/spaces is being retrieved from huggingface.
165
  """
166
 
 
217
  gr.Markdown("""<h1 align="center" id="space-title">🤗 Organization Leaderboard</h1>""")
218
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
219
 
 
 
 
 
 
220
  with gr.TabItem("🏛️ Models", id=1):
221
+
222
  columns_to_convert = ["Organization Name", "Best Model On Open LLM Leaderboard", "Most Downloaded Model", "Most Liked Model", "Trending Model"]
223
+ models_df = make_leaderboard(org_names_in_list, "models")
224
  models_df = models_df_to_clickable(models_df, columns_to_convert, "models")
225
 
226
  headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "🤖 Number of Models", "🏆 Best Model On Open LLM Leaderboard", "🥇 Best Rank On Open LLM Leaderboard", "📊 Average Downloads per Model", "📈 Average Likes per Model", "🚀 Most Downloaded Model", "📈 Most Download Count", "❤️ Most Liked Model", "👍 Most Like Count", "🔥 Trending Model", "👑 Best Rank at Trending Models"]
 
228
 
229
  with gr.TabItem("📊 Datasets", id=2):
230
  columns_to_convert = ["Organization Name", "Most Downloaded Dataset", "Most Liked Dataset", "Trending Dataset"]
231
+ dataset_df = make_leaderboard(org_names_in_list, "datasets")
232
  dataset_df = models_df_to_clickable(dataset_df, columns_to_convert, "datasets")
233
 
234
  headers = ["🔢 Serial Number", "🏢 Organization Name", "📥 Total Downloads", "👍 Total Likes", "📊 Number of Datasets", "📊 Average Downloads per Dataset", "📈 Average Likes per Dataset", "🚀 Most Downloaded Dataset", "📈 Most Download Count", "❤️ Most Liked Dataset", "👍 Most Like Count", "🔥 Trending Dataset", "👑 Best Rank at Trending Datasets"]
 
237
  with gr.TabItem("🚀 Spaces", id=3):
238
  columns_to_convert = ["Organization Name", "Most Liked Space", "Trending Space"]
239
 
240
+ spaces_df = make_leaderboard(org_names_in_list, "spaces")
241
  spaces_df = models_df_to_clickable(spaces_df, columns_to_convert, "spaces")
242
 
243
  headers = ["🔢 Serial Number", "🏢 Organization Name", "👍 Total Likes", "🚀 Number of Spaces", "📈 Average Likes per Space", "❤️ Most Liked Space", "👍 Most Like Count", "🔥 Trending Space", "👑 Best Rank at Trending Spaces"]
244
  gr.Dataframe(spaces_df.head(150), headers=headers, interactive=False, datatype=["str", "markdown", "str", "str", "str", "markdown", "str", "markdown", "str"])
245
 
246
  demo.launch()
247
+