Spaces:
Runtime error
Runtime error
nkasmanoff
committed on
Commit
•
55ee8cd
1
Parent(s):
06758b6
Create helpers.py
Browse files- helpers.py +50 -0
helpers.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
|
3 |
+
def clean_up_tags(tags_list):
    """Strip the prefix from each tag and join the tags into one string.

    A tag of the form ``"prefix:value"`` is reduced to ``"value"``; a tag
    without a colon is kept unchanged.

    Args:
        tags_list: An iterable of tag strings.

    Returns:
        str: The cleaned tags joined with ", " (an empty string when there
        are no tags).
    """
    # Split on the first colon only, so a value that itself contains a colon
    # (e.g. "doi:10.1/a:b") is kept whole instead of being truncated.
    return ", ".join(tag.split(":", 1)[-1] for tag in tags_list)
|
12 |
+
|
13 |
+
|
14 |
+
|
15 |
+
def check_api_url(url):
    """Ensure a dataset URL contains "/api" between ".co" and "/datasets".

    If the text between the first ".co" and the first following "/datasets"
    does not already contain "/api", "/api" is inserted directly after ".co".

    Args:
        url (str): A URL string.

    Returns:
        str: The URL with "/api" inserted if necessary. A URL that does not
        contain ".co" followed by "/datasets" is returned unchanged.
    """
    # partition() splits on the FIRST occurrence only, so a later ".co" or
    # "/datasets" in the path (e.g. ".../datasets/user/my.corpus") stays
    # intact instead of being dropped.
    host, dot_co, remainder = url.partition(".co")
    if not dot_co:
        # No ".co" at all: nothing to anchor the insertion on.
        return url

    between, datasets, tail = remainder.partition("/datasets")
    if not datasets:
        # Not a "/datasets" URL: leave it alone rather than raising.
        return url

    api_segment = "" if "/api" in between else "/api"
    return host + dot_co + api_segment + between + datasets + tail
|
36 |
+
|
37 |
+
|
38 |
+
|
39 |
+
def get_dataset_metadata(dataset_url, *, timeout=10):
    """Fetch selected metadata fields for a dataset over HTTP.

    The URL is first passed through ``check_api_url``, then requested with
    ``requests.get``. From the JSON response only the keys 'id',
    'description' and 'tags' are kept, and only those that are present.

    Args:
        dataset_url (str): URL of the dataset.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout the request could block indefinitely.

    Returns:
        dict: The retrieved fields. Empty when the request fails, the status
        code is not 200, or the body is not a JSON object.
    """
    keys_to_retrieve = ("id", "description", "tags")
    dataset_url = check_api_url(dataset_url)

    # Failures yield an empty dict, matching the existing non-200 behaviour.
    try:
        response = requests.get(dataset_url, timeout=timeout)
    except requests.RequestException:
        return {}

    if response.status_code != 200:
        return {}

    try:
        payload = response.json()
    except ValueError:
        # A 200 response whose body is not valid JSON.
        return {}

    if not isinstance(payload, dict):
        return {}

    return {key: payload[key] for key in keys_to_retrieve if key in payload}
|