Spaces:
Running
Running
Deploy (see actual commits on https://github.com/mlcommons/croissant).
Browse files
- Dockerfile +1 -0
- core/files.py +40 -12
- core/files_test.py +15 -6
- core/past_projects.py +6 -1
- requirements.txt +1 -0
- views/files.py +10 -11
- views/record_sets.py +4 -1
- views/splash.py +20 -10
Dockerfile
CHANGED
@@ -10,6 +10,7 @@ RUN apt-get update && apt-get install -y \
|
|
10 |
software-properties-common \
|
11 |
git \
|
12 |
python3-pip \
|
|
|
13 |
&& rm -rf /var/lib/apt/lists/*
|
14 |
|
15 |
COPY ./ /app/
|
|
|
10 |
software-properties-common \
|
11 |
git \
|
12 |
python3-pip \
|
13 |
+
libmagic1 \
|
14 |
&& rm -rf /var/lib/apt/lists/*
|
15 |
|
16 |
COPY ./ /app/
|
core/files.py
CHANGED
@@ -4,6 +4,7 @@ import io
|
|
4 |
import tempfile
|
5 |
|
6 |
from etils import epath
|
|
|
7 |
import pandas as pd
|
8 |
import requests
|
9 |
|
@@ -83,6 +84,10 @@ FILE_TYPES: dict[str, FileType] = {
|
|
83 |
]
|
84 |
}
|
85 |
|
|
|
|
|
|
|
|
|
86 |
|
87 |
def name_to_code(file_type_name: str) -> str | None:
|
88 |
"""Maps names to the encoding format: Text => plain/text."""
|
@@ -127,29 +132,34 @@ def download_file(url: str, file_path: epath.Path):
|
|
127 |
def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
|
128 |
"""Gets the df associated to the file."""
|
129 |
if file_type == FileTypes.CSV:
|
130 |
-
|
131 |
elif file_type == FileTypes.EXCEL:
|
132 |
-
|
133 |
elif file_type == FileTypes.JSON:
|
134 |
-
|
135 |
elif file_type == FileTypes.JSONL:
|
136 |
-
|
137 |
elif file_type == FileTypes.PARQUET:
|
138 |
-
|
139 |
else:
|
140 |
raise NotImplementedError()
|
|
|
141 |
|
142 |
|
143 |
-
def
|
144 |
-
|
145 |
-
)
|
|
|
|
|
|
|
146 |
"""Downloads locally and extracts the file information."""
|
147 |
file_path = hash_file_path(url)
|
148 |
if not file_path.exists():
|
149 |
download_file(url, file_path)
|
150 |
with file_path.open("rb") as file:
|
151 |
sha256 = _sha256(file.read())
|
152 |
-
|
|
|
153 |
return FileObject(
|
154 |
name=find_unique_name(names, url.split("/")[-1]),
|
155 |
description="",
|
@@ -162,15 +172,17 @@ def file_from_url(
|
|
162 |
|
163 |
|
164 |
def file_from_upload(
|
165 |
-
|
166 |
) -> FileObject:
|
167 |
"""Uploads locally and extracts the file information."""
|
168 |
value = file.getvalue()
|
169 |
content_url = f"data/{file.name}"
|
170 |
sha256 = _sha256(value)
|
171 |
-
|
|
|
172 |
f.write(value)
|
173 |
-
|
|
|
174 |
return FileObject(
|
175 |
name=find_unique_name(names, file.name),
|
176 |
description="",
|
@@ -192,3 +204,19 @@ def file_from_form(
|
|
192 |
return FileSet(name=find_unique_name(names, "file_set"), folder=folder)
|
193 |
else:
|
194 |
raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
import tempfile
|
5 |
|
6 |
from etils import epath
|
7 |
+
import magic
|
8 |
import pandas as pd
|
9 |
import requests
|
10 |
|
|
|
84 |
]
|
85 |
}
|
86 |
|
87 |
+
ENCODING_FORMATS: dict[str, FileType] = {
|
88 |
+
file_type.encoding_format: file_type for file_type in FILE_TYPES.values()
|
89 |
+
}
|
90 |
+
|
91 |
|
92 |
def name_to_code(file_type_name: str) -> str | None:
|
93 |
"""Maps names to the encoding format: Text => plain/text."""
|
|
|
132 |
def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
    """Reads `file` into a DataFrame using the pandas reader for `file_type`.

    Args:
        file_type: One of the `FileTypes` members handled below (CSV, EXCEL,
            JSON, JSONL, PARQUET).
        file: In-memory buffer or path handed straight to the pandas reader.

    Returns:
        The parsed DataFrame after `infer_objects()` has been applied.

    Raises:
        NotImplementedError: If `file_type` matches none of the handled types.
    """
    # Ordered (file type, reader) pairs; matched with `==` in this order.
    readers = (
        (FileTypes.CSV, pd.read_csv),
        (FileTypes.EXCEL, pd.read_excel),
        (FileTypes.JSON, pd.read_json),
        # JSON Lines: one JSON document per line.
        (FileTypes.JSONL, lambda source: pd.read_json(source, lines=True)),
        (FileTypes.PARQUET, pd.read_parquet),
    )
    for candidate, read in readers:
        if file_type == candidate:
            return read(file).infer_objects()
    raise NotImplementedError()
|
147 |
|
148 |
|
149 |
+
def guess_file_type(path: epath.Path) -> FileType | None:
    """Guesses the FileType of the file at `path` from its detected MIME type.

    The MIME type reported by `magic.from_file(..., mime=True)` is looked up in
    `ENCODING_FORMATS`.

    Returns:
        The matching FileType, or None when the detected MIME type is not a key
        of `ENCODING_FORMATS`.
    """
    detected_mime = magic.from_file(path, mime=True)
    matching_type = ENCODING_FORMATS.get(detected_mime)
    return matching_type
|
152 |
+
|
153 |
+
|
154 |
+
def file_from_url(url: str, names: set[str], folder: epath.Path) -> FileObject:
|
155 |
"""Downloads locally and extracts the file information."""
|
156 |
file_path = hash_file_path(url)
|
157 |
if not file_path.exists():
|
158 |
download_file(url, file_path)
|
159 |
with file_path.open("rb") as file:
|
160 |
sha256 = _sha256(file.read())
|
161 |
+
file_type = guess_file_type(file_path)
|
162 |
+
df = get_dataframe(file_type, file_path)
|
163 |
return FileObject(
|
164 |
name=find_unique_name(names, url.split("/")[-1]),
|
165 |
description="",
|
|
|
172 |
|
173 |
|
174 |
def file_from_upload(
|
175 |
+
file: io.BytesIO, names: set[str], folder: epath.Path
|
176 |
) -> FileObject:
|
177 |
"""Uploads locally and extracts the file information."""
|
178 |
value = file.getvalue()
|
179 |
content_url = f"data/{file.name}"
|
180 |
sha256 = _sha256(value)
|
181 |
+
file_path = get_resource_path(content_url)
|
182 |
+
with file_path.open("wb") as f:
|
183 |
f.write(value)
|
184 |
+
file_type = guess_file_type(file_path)
|
185 |
+
df = get_dataframe(file_type, file)
|
186 |
return FileObject(
|
187 |
name=find_unique_name(names, file.name),
|
188 |
description="",
|
|
|
204 |
return FileSet(name=find_unique_name(names, "file_set"), folder=folder)
|
205 |
else:
|
206 |
raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
|
207 |
+
|
208 |
+
|
209 |
+
def is_url(file: "FileObject") -> bool:
    """Tells whether the file's content URL looks like a remote HTTP(S) URL.

    Args:
        file: Resource whose `content_url` attribute is inspected.

    Returns:
        True if `content_url` is set and starts with "http"; False otherwise,
        including when `content_url` is None or an empty string.
    """
    content_url = file.content_url
    # Coerce explicitly: a bare `x and y` would hand None or "" back to the
    # caller even though the declared return type is `bool`.
    return bool(content_url) and content_url.startswith("http")
|
211 |
+
|
212 |
+
|
213 |
+
def trigger_download(file: FileObject):
    """Loads the content behind `file.content_url` into `file.df`.

    When `is_url(file)` is true, the content is downloaded to the path given by
    `hash_file_path` unless that path already exists. Otherwise the content URL
    is resolved with `get_resource_path`. The file type is then guessed from the
    resolved path and the resulting DataFrame is stored on `file.df`.

    Args:
        file: Resource to populate; its `df` attribute is overwritten.
    """
    if not is_url(file):
        # Not a remote URL: resolve through the resource path helper.
        local_path = get_resource_path(file.content_url)
    else:
        local_path = hash_file_path(file.content_url)
        # Skip the download when the target path is already present.
        if not local_path.exists():
            download_file(file.content_url, local_path)
    detected_type = guess_file_type(local_path)
    file.df = get_dataframe(detected_type, local_path)
|
core/files_test.py
CHANGED
@@ -1,12 +1,17 @@
|
|
|
|
|
|
1 |
from etils import epath
|
2 |
import pandas as pd
|
3 |
import pytest
|
4 |
|
5 |
-
from
|
6 |
-
|
|
|
7 |
|
8 |
|
9 |
-
|
|
|
|
|
10 |
csv = epath.Path(
|
11 |
# This is the hash path for "https://my.url".
|
12 |
"/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
|
@@ -18,10 +23,14 @@ def test_check_file_csv():
|
|
18 |
f.write("a,1\n")
|
19 |
f.write("b,2\n")
|
20 |
f.write("c,3\n")
|
21 |
-
file = file_from_url(
|
22 |
pd.testing.assert_frame_equal(
|
23 |
file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
|
24 |
)
|
25 |
-
|
|
|
|
|
|
|
|
|
26 |
with pytest.raises(NotImplementedError):
|
27 |
-
file_from_url("
|
|
|
1 |
+
from unittest import mock
|
2 |
+
|
3 |
from etils import epath
|
4 |
import pandas as pd
|
5 |
import pytest
|
6 |
|
7 |
+
from core import files as files_module
|
8 |
+
|
9 |
+
FileTypes = files_module.FileTypes
|
10 |
|
11 |
|
12 |
+
@mock.patch.object(files_module, "guess_file_type", return_value=FileTypes.CSV)
|
13 |
+
def test_check_file_csv(guess_file_type):
|
14 |
+
del guess_file_type
|
15 |
csv = epath.Path(
|
16 |
# This is the hash path for "https://my.url".
|
17 |
"/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
|
|
|
23 |
f.write("a,1\n")
|
24 |
f.write("b,2\n")
|
25 |
f.write("c,3\n")
|
26 |
+
file = files_module.file_from_url("https://my.url", set(), epath.Path())
|
27 |
pd.testing.assert_frame_equal(
|
28 |
file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
|
29 |
)
|
30 |
+
|
31 |
+
|
32 |
+
@mock.patch.object(files_module, "guess_file_type", return_value="unknown")
|
33 |
+
def test_check_file_unknown(guess_file_type):
|
34 |
+
del guess_file_type
|
35 |
with pytest.raises(NotImplementedError):
|
36 |
+
files_module.file_from_url("https://my.url", set(), epath.Path())
|
core/past_projects.py
CHANGED
@@ -7,6 +7,7 @@ import streamlit as st
|
|
7 |
from core.constants import PAST_PROJECTS_PATH
|
8 |
from core.query_params import set_project
|
9 |
from core.state import CurrentProject
|
|
|
10 |
from core.state import get_cached_user
|
11 |
from core.state import Metadata
|
12 |
|
@@ -23,13 +24,17 @@ def _pickle_file(path: epath.Path) -> epath.Path:
|
|
23 |
|
24 |
|
25 |
def save_current_project():
|
26 |
-
metadata = st.session_state[Metadata]
|
27 |
project = st.session_state.get(CurrentProject)
|
28 |
if not project:
|
29 |
project = CurrentProject.create_new()
|
30 |
st.session_state[CurrentProject] = project
|
31 |
project.path.mkdir(parents=True, exist_ok=True)
|
32 |
set_project(project)
|
|
|
|
|
|
|
|
|
33 |
try:
|
34 |
pickled = pickle.dumps(metadata)
|
35 |
_pickle_file(project.path).write_bytes(pickled)
|
|
|
7 |
from core.constants import PAST_PROJECTS_PATH
|
8 |
from core.query_params import set_project
|
9 |
from core.state import CurrentProject
|
10 |
+
from core.state import FileObject
|
11 |
from core.state import get_cached_user
|
12 |
from core.state import Metadata
|
13 |
|
|
|
24 |
|
25 |
|
26 |
def save_current_project():
|
27 |
+
metadata: Metadata = st.session_state[Metadata]
|
28 |
project = st.session_state.get(CurrentProject)
|
29 |
if not project:
|
30 |
project = CurrentProject.create_new()
|
31 |
st.session_state[CurrentProject] = project
|
32 |
project.path.mkdir(parents=True, exist_ok=True)
|
33 |
set_project(project)
|
34 |
+
# FileObjects should have a folder.
|
35 |
+
for resource in metadata.distribution:
|
36 |
+
if isinstance(resource, FileObject):
|
37 |
+
resource.folder = project.path
|
38 |
try:
|
39 |
pickled = pickle.dumps(metadata)
|
40 |
_pickle_file(project.path).write_bytes(pickled)
|
requirements.txt
CHANGED
@@ -3,6 +3,7 @@ mlcroissant
|
|
3 |
numpy
|
4 |
pandas
|
5 |
pytest
|
|
|
6 |
rdflib
|
7 |
requests
|
8 |
streamlit
|
|
|
3 |
numpy
|
4 |
pandas
|
5 |
pytest
|
6 |
+
python-magic
|
7 |
rdflib
|
8 |
requests
|
9 |
streamlit
|
views/files.py
CHANGED
@@ -11,7 +11,9 @@ from core.files import file_from_url
|
|
11 |
from core.files import FILE_OBJECT
|
12 |
from core.files import FILE_SET
|
13 |
from core.files import FILE_TYPES
|
|
|
14 |
from core.files import RESOURCE_TYPES
|
|
|
15 |
from core.path import get_resource_path
|
16 |
from core.record_sets import infer_record_sets
|
17 |
from core.state import CurrentProject
|
@@ -55,19 +57,21 @@ def _render_warnings():
|
|
55 |
metadata: Metadata = st.session_state[Metadata]
|
56 |
warning = ""
|
57 |
for resource in metadata.distribution:
|
|
|
|
|
58 |
content_url = resource.content_url
|
59 |
if content_url and not content_url.startswith("http"):
|
60 |
path = get_resource_path(content_url)
|
61 |
if not path.exists():
|
62 |
if OAUTH_CLIENT_ID:
|
63 |
warning += (
|
64 |
-
f'⚠️ Resource "{resource.name}" points to a local file
|
65 |
" doesn't exist on the disk. Fix this by changing the content"
|
66 |
" URL.\n\n"
|
67 |
)
|
68 |
else:
|
69 |
warning += (
|
70 |
-
f'⚠️ Resource "{resource.name}" points to a local file
|
71 |
" doesn't exist on the disk. Fix this by either downloading"
|
72 |
f" it to {path} or changing the content URL.\n\n"
|
73 |
)
|
@@ -107,7 +111,6 @@ def _render_resources_panel(files: list[Resource]) -> Resource | None:
|
|
107 |
def _render_upload_panel():
|
108 |
"""Renders the form to upload from local or upload from URL."""
|
109 |
with st.form(key="upload_form", clear_on_submit=True):
|
110 |
-
file_type_name = st.selectbox("Encoding format", options=FILE_TYPES.keys())
|
111 |
tab1, tab2, tab3 = st.tabs([
|
112 |
"Import from a local file", "Import from a URL", "Add manually"
|
113 |
])
|
@@ -124,15 +127,14 @@ def _render_upload_panel():
|
|
124 |
def handle_on_click():
|
125 |
url = st.session_state[_DISTANT_URL_KEY]
|
126 |
uploaded_file = st.session_state[_LOCAL_FILE_KEY]
|
127 |
-
file_type = FILE_TYPES[file_type_name]
|
128 |
metadata: Metadata = st.session_state[Metadata]
|
129 |
names = metadata.names()
|
130 |
project: CurrentProject = st.session_state[CurrentProject]
|
131 |
folder = project.path
|
132 |
if url:
|
133 |
-
file = file_from_url(
|
134 |
elif uploaded_file:
|
135 |
-
file = file_from_upload(
|
136 |
else:
|
137 |
resource_type = st.session_state[_MANUAL_RESOURCE_TYPE_KEY]
|
138 |
file = file_from_form(resource_type, names, folder)
|
@@ -191,7 +193,7 @@ def _render_resource_details(selected_file: Resource):
|
|
191 |
)
|
192 |
|
193 |
|
194 |
-
def _render_resource(prefix: int, file:
|
195 |
parent_options = [f.name for f in st.session_state[Metadata].distribution]
|
196 |
key = f"{prefix}_parents"
|
197 |
st.multiselect(
|
@@ -264,10 +266,7 @@ def _render_resource(prefix: int, file: FileObject | FileSet, is_file_object: bo
|
|
264 |
)
|
265 |
if is_file_object:
|
266 |
st.markdown("First rows of data:")
|
267 |
-
is_url = file.content_url and file.content_url.startswith("http")
|
268 |
if file.df is not None:
|
269 |
st.dataframe(file.df, height=DF_HEIGHT)
|
270 |
-
elif is_url:
|
271 |
-
st.button("Trigger download")
|
272 |
else:
|
273 |
-
st.
|
|
|
11 |
from core.files import FILE_OBJECT
|
12 |
from core.files import FILE_SET
|
13 |
from core.files import FILE_TYPES
|
14 |
+
from core.files import is_url
|
15 |
from core.files import RESOURCE_TYPES
|
16 |
+
from core.files import trigger_download
|
17 |
from core.path import get_resource_path
|
18 |
from core.record_sets import infer_record_sets
|
19 |
from core.state import CurrentProject
|
|
|
57 |
metadata: Metadata = st.session_state[Metadata]
|
58 |
warning = ""
|
59 |
for resource in metadata.distribution:
|
60 |
+
if not isinstance(resource, FileObject):
|
61 |
+
continue
|
62 |
content_url = resource.content_url
|
63 |
if content_url and not content_url.startswith("http"):
|
64 |
path = get_resource_path(content_url)
|
65 |
if not path.exists():
|
66 |
if OAUTH_CLIENT_ID:
|
67 |
warning += (
|
68 |
+
f'⚠️ Resource "{resource.name}" points to a local file that'
|
69 |
" doesn't exist on the disk. Fix this by changing the content"
|
70 |
" URL.\n\n"
|
71 |
)
|
72 |
else:
|
73 |
warning += (
|
74 |
+
f'⚠️ Resource "{resource.name}" points to a local file that'
|
75 |
" doesn't exist on the disk. Fix this by either downloading"
|
76 |
f" it to {path} or changing the content URL.\n\n"
|
77 |
)
|
|
|
111 |
def _render_upload_panel():
|
112 |
"""Renders the form to upload from local or upload from URL."""
|
113 |
with st.form(key="upload_form", clear_on_submit=True):
|
|
|
114 |
tab1, tab2, tab3 = st.tabs([
|
115 |
"Import from a local file", "Import from a URL", "Add manually"
|
116 |
])
|
|
|
127 |
def handle_on_click():
|
128 |
url = st.session_state[_DISTANT_URL_KEY]
|
129 |
uploaded_file = st.session_state[_LOCAL_FILE_KEY]
|
|
|
130 |
metadata: Metadata = st.session_state[Metadata]
|
131 |
names = metadata.names()
|
132 |
project: CurrentProject = st.session_state[CurrentProject]
|
133 |
folder = project.path
|
134 |
if url:
|
135 |
+
file = file_from_url(url, names, folder)
|
136 |
elif uploaded_file:
|
137 |
+
file = file_from_upload(uploaded_file, names, folder)
|
138 |
else:
|
139 |
resource_type = st.session_state[_MANUAL_RESOURCE_TYPE_KEY]
|
140 |
file = file_from_form(resource_type, names, folder)
|
|
|
193 |
)
|
194 |
|
195 |
|
196 |
+
def _render_resource(prefix: int, file: Resource, is_file_object: bool):
|
197 |
parent_options = [f.name for f in st.session_state[Metadata].distribution]
|
198 |
key = f"{prefix}_parents"
|
199 |
st.multiselect(
|
|
|
266 |
)
|
267 |
if is_file_object:
|
268 |
st.markdown("First rows of data:")
|
|
|
269 |
if file.df is not None:
|
270 |
st.dataframe(file.df, height=DF_HEIGHT)
|
|
|
|
|
271 |
else:
|
272 |
+
st.button("Trigger download", on_click=trigger_download, args=(file,))
|
views/record_sets.py
CHANGED
@@ -30,6 +30,8 @@ from views.source import render_source
|
|
30 |
|
31 |
_NUM_RECORDS = 3
|
32 |
_TIMEOUT_SECONDS = 1
|
|
|
|
|
33 |
|
34 |
|
35 |
class _Result(TypedDict):
|
@@ -214,6 +216,7 @@ class FieldDataFrame:
|
|
214 |
|
215 |
|
216 |
def render_record_sets():
|
|
|
217 |
col1, col2 = st.columns([1, 1])
|
218 |
with col1:
|
219 |
with st.spinner("Generating the dataset..."):
|
@@ -361,7 +364,7 @@ def _render_left_panel():
|
|
361 |
left.button(
|
362 |
"⚠️",
|
363 |
key=f"idea-{prefix}",
|
364 |
-
|
365 |
help=textwrap.dedent(f"""**Error**:
|
366 |
```
|
367 |
{exception}
|
|
|
30 |
|
31 |
_NUM_RECORDS = 3
|
32 |
_TIMEOUT_SECONDS = 1
|
33 |
+
_INFO = """RecordSets describe sets of structured records obtained from resources or
|
34 |
+
other RecordSets. You can think of RecordSets as tables with typed fields."""
|
35 |
|
36 |
|
37 |
class _Result(TypedDict):
|
|
|
216 |
|
217 |
|
218 |
def render_record_sets():
|
219 |
+
st.info(_INFO, icon="💡")
|
220 |
col1, col2 = st.columns([1, 1])
|
221 |
with col1:
|
222 |
with st.spinner("Generating the dataset..."):
|
|
|
364 |
left.button(
|
365 |
"⚠️",
|
366 |
key=f"idea-{prefix}",
|
367 |
+
on_click=lambda: _generate_data_with_timeout.clear(),
|
368 |
help=textwrap.dedent(f"""**Error**:
|
369 |
```
|
370 |
{exception}
|
views/splash.py
CHANGED
@@ -5,6 +5,7 @@ import streamlit as st
|
|
5 |
|
6 |
from core.constants import OAUTH_CLIENT_ID
|
7 |
from core.past_projects import save_current_project
|
|
|
8 |
from core.query_params import set_project
|
9 |
from core.state import CurrentProject
|
10 |
from core.state import Metadata
|
@@ -12,6 +13,16 @@ import mlcroissant as mlc
|
|
12 |
from views.load import render_load
|
13 |
from views.previous_files import render_previous_files
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
def render_splash():
|
17 |
if OAUTH_CLIENT_ID:
|
@@ -39,12 +50,19 @@ def render_splash():
|
|
39 |
with st.expander("**Try out an example!**", expanded=True):
|
40 |
|
41 |
def create_example(dataset: str):
|
42 |
-
|
|
|
43 |
try:
|
44 |
json = requests.get(url).json()
|
45 |
metadata = mlc.Metadata.from_json(mlc.Issues(), json, None)
|
46 |
st.session_state[Metadata] = Metadata.from_canonical(metadata)
|
47 |
save_current_project()
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
except Exception as exception:
|
49 |
logging.error(exception)
|
50 |
st.error(
|
@@ -55,15 +73,7 @@ def render_splash():
|
|
55 |
|
56 |
dataset = st.selectbox(
|
57 |
label="Dataset",
|
58 |
-
options=
|
59 |
-
"Titanic",
|
60 |
-
"FLORES-200",
|
61 |
-
"GPT-3",
|
62 |
-
"COCO2014",
|
63 |
-
"PASS",
|
64 |
-
"MovieLens",
|
65 |
-
"Bigcode-The-Stack",
|
66 |
-
],
|
67 |
)
|
68 |
st.button(
|
69 |
f"{dataset} dataset",
|
|
|
5 |
|
6 |
from core.constants import OAUTH_CLIENT_ID
|
7 |
from core.past_projects import save_current_project
|
8 |
+
from core.path import get_resource_path
|
9 |
from core.query_params import set_project
|
10 |
from core.state import CurrentProject
|
11 |
from core.state import Metadata
|
|
|
13 |
from views.load import render_load
|
14 |
from views.previous_files import render_previous_files
|
15 |
|
16 |
+
_DATASETS = {
|
17 |
+
"Titanic": ["data/embarkation_ports.csv", "data/genders.csv"],
|
18 |
+
"FLORES-200": [],
|
19 |
+
"GPT-3": [],
|
20 |
+
"COCO2014": [],
|
21 |
+
"PASS": [],
|
22 |
+
"MovieLens": [],
|
23 |
+
"Bigcode-The-Stack": [],
|
24 |
+
}
|
25 |
+
|
26 |
|
27 |
def render_splash():
|
28 |
if OAUTH_CLIENT_ID:
|
|
|
50 |
with st.expander("**Try out an example!**", expanded=True):
|
51 |
|
52 |
def create_example(dataset: str):
|
53 |
+
base = f"https://raw.githubusercontent.com/mlcommons/croissant/main/datasets/{dataset.lower()}"
|
54 |
+
url = f"{base}/metadata.json"
|
55 |
try:
|
56 |
json = requests.get(url).json()
|
57 |
metadata = mlc.Metadata.from_json(mlc.Issues(), json, None)
|
58 |
st.session_state[Metadata] = Metadata.from_canonical(metadata)
|
59 |
save_current_project()
|
60 |
+
# Write supplementary files.
|
61 |
+
files = _DATASETS.get(dataset, [])
|
62 |
+
for file in files:
|
63 |
+
path = get_resource_path(file)
|
64 |
+
json = requests.get(f"{base}/{file}")
|
65 |
+
path.write_bytes(json.content)
|
66 |
except Exception as exception:
|
67 |
logging.error(exception)
|
68 |
st.error(
|
|
|
73 |
|
74 |
dataset = st.selectbox(
|
75 |
label="Dataset",
|
76 |
+
options=_DATASETS.keys(),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
)
|
78 |
st.button(
|
79 |
f"{dataset} dataset",
|