Spaces:
Running
Running
Deploy (see actual commits on https://github.com/mlcommons/croissant).
Browse files- Dockerfile +23 -0
- core/data_types.py +27 -0
- core/data_types_test.py +8 -0
- core/files.py +61 -8
- core/files_test.py +2 -2
- core/path.py +13 -0
- core/state.py +1 -0
- events/fields.py +2 -1
- events/metadata.py +81 -1
- events/metadata_test.py +7 -0
- events/resources.py +8 -1
- utils.py +0 -11
- views/files.py +41 -10
- views/metadata.py +19 -26
- views/overview.py +0 -11
- views/record_sets.py +38 -19
Dockerfile
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# app/Dockerfile
# Image that installs the Python requirements and starts the Streamlit app.

FROM python:3.10-slim

WORKDIR /app

# System packages; `curl` is used by the HEALTHCHECK below.
# The apt lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    git \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

COPY ./ /app/

# `--no-cache-dir` keeps pip's download cache out of the image layer.
RUN python3 -m pip install --no-cache-dir -r requirements.txt

EXPOSE 8501

# Streamlit exposes a health endpoint on the port published above.
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

ENTRYPOINT ["streamlit", "run", "app.py"]
|
core/data_types.py
CHANGED
@@ -17,3 +17,30 @@ def convert_dtype(dtype: Any):
|
|
17 |
return mlc.DataType.TEXT
|
18 |
else:
|
19 |
raise NotImplementedError(dtype)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
return mlc.DataType.TEXT
|
18 |
else:
|
19 |
raise NotImplementedError(dtype)
|
20 |
+
|
21 |
+
|
22 |
+
MLC_DATA_TYPES = [
|
23 |
+
mlc.DataType.TEXT,
|
24 |
+
mlc.DataType.FLOAT,
|
25 |
+
mlc.DataType.INTEGER,
|
26 |
+
mlc.DataType.BOOL,
|
27 |
+
mlc.DataType.URL,
|
28 |
+
]
|
29 |
+
|
30 |
+
STR_DATA_TYPES = [
|
31 |
+
str(data_type).replace("https://schema.org/", "") for data_type in MLC_DATA_TYPES
|
32 |
+
]
|
33 |
+
|
34 |
+
|
35 |
+
def str_to_mlc_data_type(data_type: str) -> mlc.DataType | None:
    """Returns the entry of `MLC_DATA_TYPES` whose short name is `data_type`.

    `STR_DATA_TYPES` and `MLC_DATA_TYPES` are parallel lists, so both are walked
    in lockstep. Returns None when no short name matches.
    """
    matches = (
        mlc_type
        for short_name, mlc_type in zip(STR_DATA_TYPES, MLC_DATA_TYPES)
        if data_type == short_name
    )
    return next(matches, None)
|
40 |
+
|
41 |
+
|
42 |
+
def mlc_to_str_data_type(data_type: mlc.DataType) -> str | None:
    """Returns the short name in `STR_DATA_TYPES` matching an mlc data type.

    This is the inverse of `str_to_mlc_data_type`: `STR_DATA_TYPES` and
    `MLC_DATA_TYPES` are parallel lists, so both are walked in lockstep.

    Args:
        data_type: One of the values listed in `MLC_DATA_TYPES`.

    Returns:
        The matching short name, or None when `data_type` is not listed.
    """
    for str_data_type, mlc_data_type in zip(STR_DATA_TYPES, MLC_DATA_TYPES):
        if data_type == mlc_data_type:
            return str_data_type
    return None
|
core/data_types_test.py
CHANGED
@@ -3,7 +3,10 @@
|
|
3 |
import numpy as np
|
4 |
import pytest
|
5 |
|
|
|
|
|
6 |
from .data_types import convert_dtype
|
|
|
7 |
|
8 |
|
9 |
def test_convert_dtype():
|
@@ -13,3 +16,8 @@ def test_convert_dtype():
|
|
13 |
convert_dtype(np.str_) == "https://schema.org/Text"
|
14 |
with pytest.raises(NotImplementedError):
|
15 |
convert_dtype(np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
3 |
import numpy as np
|
4 |
import pytest
|
5 |
|
6 |
+
import mlcroissant as mlc
|
7 |
+
|
8 |
from .data_types import convert_dtype
|
9 |
+
from .data_types import str_to_mlc_data_type
|
10 |
|
11 |
|
12 |
def test_convert_dtype():
|
|
|
16 |
convert_dtype(np.str_) == "https://schema.org/Text"
|
17 |
with pytest.raises(NotImplementedError):
|
18 |
convert_dtype(np.float32)
|
19 |
+
|
20 |
+
|
21 |
+
def test_str_to_mlc_data_type():
    """Checks a known short name and an unknown one."""
    assert str_to_mlc_data_type("Integer") == mlc.DataType.INTEGER
    # Identity check: `None` is a singleton, so `is` is the idiomatic comparison.
    assert str_to_mlc_data_type("Foo") is None
|
core/files.py
CHANGED
@@ -8,6 +8,7 @@ import pandas as pd
|
|
8 |
import requests
|
9 |
|
10 |
from .names import find_unique_name
|
|
|
11 |
from .state import FileObject
|
12 |
from .state import FileSet
|
13 |
|
@@ -30,6 +31,8 @@ class FileTypes:
|
|
30 |
encoding_format="application/vnd.ms-excel",
|
31 |
extensions=["xls", "xlsx", "xlsm"],
|
32 |
)
|
|
|
|
|
33 |
JSON = FileType(
|
34 |
name="JSON", encoding_format="application/json", extensions=["json"]
|
35 |
)
|
@@ -43,20 +46,60 @@ class FileTypes:
|
|
43 |
encoding_format="application/vnd.apache.parquet",
|
44 |
extensions=["parquet"],
|
45 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
|
48 |
FILE_TYPES: dict[str, FileType] = {
|
49 |
-
file_type
|
50 |
for file_type in [
|
51 |
FileTypes.CSV,
|
52 |
FileTypes.EXCEL,
|
|
|
|
|
53 |
FileTypes.JSON,
|
54 |
FileTypes.JSONL,
|
55 |
FileTypes.PARQUET,
|
|
|
|
|
|
|
56 |
]
|
57 |
}
|
58 |
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
def _sha256(content: bytes):
|
61 |
"""Computes the sha256 digest of the byte string."""
|
62 |
return hashlib.sha256(content).hexdigest()
|
@@ -97,7 +140,9 @@ def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.Data
|
|
97 |
raise NotImplementedError()
|
98 |
|
99 |
|
100 |
-
def file_from_url(
|
|
|
|
|
101 |
"""Downloads locally and extracts the file information."""
|
102 |
file_path = hash_file_path(url)
|
103 |
if not file_path.exists():
|
@@ -112,30 +157,38 @@ def file_from_url(file_type: FileType, url: str, names: set[str]) -> FileObject:
|
|
112 |
encoding_format=file_type.encoding_format,
|
113 |
sha256=sha256,
|
114 |
df=df,
|
|
|
115 |
)
|
116 |
|
117 |
|
118 |
def file_from_upload(
|
119 |
-
file_type: FileType, file: io.BytesIO, names: set[str]
|
120 |
) -> FileObject:
|
121 |
"""Uploads locally and extracts the file information."""
|
122 |
-
|
|
|
|
|
|
|
|
|
123 |
df = get_dataframe(file_type, file).infer_objects()
|
124 |
return FileObject(
|
125 |
name=find_unique_name(names, file.name),
|
126 |
description="",
|
127 |
-
content_url=
|
128 |
encoding_format=file_type.encoding_format,
|
129 |
sha256=sha256,
|
130 |
df=df,
|
|
|
131 |
)
|
132 |
|
133 |
|
134 |
-
def file_from_form(
|
|
|
|
|
135 |
"""Creates a file based on manually added fields."""
|
136 |
if type == FILE_OBJECT:
|
137 |
-
return FileObject(name=find_unique_name(names, "file_object"))
|
138 |
elif type == FILE_SET:
|
139 |
-
return FileSet(name=find_unique_name(names, "file_set"))
|
140 |
else:
|
141 |
raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
|
|
|
8 |
import requests
|
9 |
|
10 |
from .names import find_unique_name
|
11 |
+
from .path import get_resource_path
|
12 |
from .state import FileObject
|
13 |
from .state import FileSet
|
14 |
|
|
|
31 |
encoding_format="application/vnd.ms-excel",
|
32 |
extensions=["xls", "xlsx", "xlsm"],
|
33 |
)
|
34 |
+
GZIP = FileType(name="GZIP", encoding_format="application/gzip", extensions=["gz"])
|
35 |
+
# JPEG files use the `.jpeg`/`.jpg` extensions (`.json` belongs to JSON).
JPEG = FileType(name="JPEG", encoding_format="image/jpeg", extensions=["jpeg", "jpg"])
|
36 |
JSON = FileType(
|
37 |
name="JSON", encoding_format="application/json", extensions=["json"]
|
38 |
)
|
|
|
46 |
encoding_format="application/vnd.apache.parquet",
|
47 |
extensions=["parquet"],
|
48 |
)
|
49 |
+
TAR = FileType(
|
50 |
+
name="Archive (TAR)",
|
51 |
+
encoding_format="application/x-tar",
|
52 |
+
extensions=["tar"],
|
53 |
+
)
|
54 |
+
# The registered media type for plain text is `text/plain`.
TXT = FileType(
    name="Text",
    encoding_format="text/plain",
    extensions=["txt"],
)
|
59 |
+
ZIP = FileType(
|
60 |
+
name="ZIP",
|
61 |
+
encoding_format="application/zip",
|
62 |
+
extensions=["zip"],
|
63 |
+
)
|
64 |
+
|
65 |
+
|
66 |
+
def _full_name(file_type: FileType):
    """Builds the display label of a file type: `<name> (<encoding format>)`."""
    label = file_type.name
    mime = file_type.encoding_format
    return "{} ({})".format(label, mime)
|
68 |
|
69 |
|
70 |
FILE_TYPES: dict[str, FileType] = {
|
71 |
+
_full_name(file_type): file_type
|
72 |
for file_type in [
|
73 |
FileTypes.CSV,
|
74 |
FileTypes.EXCEL,
|
75 |
+
FileTypes.GZIP,
|
76 |
+
FileTypes.JPEG,
|
77 |
FileTypes.JSON,
|
78 |
FileTypes.JSONL,
|
79 |
FileTypes.PARQUET,
|
80 |
+
FileTypes.TAR,
|
81 |
+
FileTypes.TXT,
|
82 |
+
FileTypes.ZIP,
|
83 |
]
|
84 |
}
|
85 |
|
86 |
|
87 |
+
def name_to_code(file_type_name: str) -> str | None:
    """Maps a key of `FILE_TYPES` to the encoding format of its file type.

    For example: "JSON (application/json)" => "application/json".

    Args:
        file_type_name: A display name, as used for the keys of `FILE_TYPES`.

    Returns:
        The encoding format of the matching file type, or None when the name
        is not a key of `FILE_TYPES`.
    """
    # `FILE_TYPES` is keyed by display name, so a direct lookup replaces the scan.
    file_type = FILE_TYPES.get(file_type_name)
    if file_type is None:
        return None
    return file_type.encoding_format
|
93 |
+
|
94 |
+
|
95 |
+
def code_to_index(encoding_format: str) -> int | None:
    """Maps an encoding format to the position of its file type in `FILE_TYPES`.

    The position follows the iteration order of `FILE_TYPES`, so it can be used
    as an index into `FILE_TYPES.keys()`. Returns None when no file type has
    this encoding format.
    """
    positions = (
        position
        for position, file_type in enumerate(FILE_TYPES.values())
        if file_type.encoding_format == encoding_format
    )
    return next(positions, None)
|
101 |
+
|
102 |
+
|
103 |
def _sha256(content: bytes):
    """Returns the hexadecimal SHA-256 digest of `content`."""
    hasher = hashlib.sha256()
    hasher.update(content)
    return hasher.hexdigest()
|
|
|
140 |
raise NotImplementedError()
|
141 |
|
142 |
|
143 |
+
def file_from_url(
|
144 |
+
file_type: FileType, url: str, names: set[str], folder: epath.Path
|
145 |
+
) -> FileObject:
|
146 |
"""Downloads locally and extracts the file information."""
|
147 |
file_path = hash_file_path(url)
|
148 |
if not file_path.exists():
|
|
|
157 |
encoding_format=file_type.encoding_format,
|
158 |
sha256=sha256,
|
159 |
df=df,
|
160 |
+
folder=folder,
|
161 |
)
|
162 |
|
163 |
|
164 |
def file_from_upload(
|
165 |
+
file_type: FileType, file: io.BytesIO, names: set[str], folder: epath.Path
|
166 |
) -> FileObject:
|
167 |
"""Uploads locally and extracts the file information."""
|
168 |
+
value = file.getvalue()
|
169 |
+
content_url = f"data/{file.name}"
|
170 |
+
sha256 = _sha256(value)
|
171 |
+
with get_resource_path(content_url).open("wb") as f:
|
172 |
+
f.write(value)
|
173 |
df = get_dataframe(file_type, file).infer_objects()
|
174 |
return FileObject(
|
175 |
name=find_unique_name(names, file.name),
|
176 |
description="",
|
177 |
+
content_url=content_url,
|
178 |
encoding_format=file_type.encoding_format,
|
179 |
sha256=sha256,
|
180 |
df=df,
|
181 |
+
folder=folder,
|
182 |
)
|
183 |
|
184 |
|
185 |
+
def file_from_form(
|
186 |
+
type: str, names: set[str], folder: epath.Path
|
187 |
+
) -> FileObject | FileSet:
|
188 |
"""Creates a file based on manually added fields."""
|
189 |
if type == FILE_OBJECT:
|
190 |
+
return FileObject(name=find_unique_name(names, "file_object"), folder=folder)
|
191 |
elif type == FILE_SET:
|
192 |
+
return FileSet(name=find_unique_name(names, "file_set"), folder=folder)
|
193 |
else:
|
194 |
raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
|
core/files_test.py
CHANGED
@@ -18,10 +18,10 @@ def test_check_file_csv():
|
|
18 |
f.write("a,1\n")
|
19 |
f.write("b,2\n")
|
20 |
f.write("c,3\n")
|
21 |
-
file = file_from_url(FileTypes.CSV, "https://my.url", set())
|
22 |
pd.testing.assert_frame_equal(
|
23 |
file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
|
24 |
)
|
25 |
# Fails with unknown encoding_format:
|
26 |
with pytest.raises(NotImplementedError):
|
27 |
-
file_from_url("unknown", "https://my.url", set())
|
|
|
18 |
f.write("a,1\n")
|
19 |
f.write("b,2\n")
|
20 |
f.write("c,3\n")
|
21 |
+
file = file_from_url(FileTypes.CSV, "https://my.url", set(), epath.Path())
|
22 |
pd.testing.assert_frame_equal(
|
23 |
file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
|
24 |
)
|
25 |
# Fails with unknown encoding_format:
|
26 |
with pytest.raises(NotImplementedError):
|
27 |
+
file_from_url("unknown", "https://my.url", set(), epath.Path())
|
core/path.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from etils import epath
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
from core.state import CurrentProject
|
5 |
+
|
6 |
+
|
7 |
+
def get_resource_path(content_url: str) -> epath.Path:
    """Gets the path on disk of the resource with `content_url`.

    The path is `content_url` joined to the path of the `CurrentProject` stored
    in the Streamlit session state. As a side effect, the parent folder of the
    returned path is created when it does not exist yet.

    Args:
        content_url: Location of the resource, relative to the project folder.

    Returns:
        The path of the resource inside the project folder. The resource itself
        is not created and may not exist.
    """
    project: CurrentProject = st.session_state[CurrentProject]
    # NOTE(review): callers pass user-provided values (uploaded file names,
    # content URLs); `..` components are not rejected here -- confirm this is OK.
    path = project.path / content_url
    # `exist_ok=True` already makes this a no-op for an existing folder, so no
    # separate (race-prone) `exists()` check is needed.
    path.parent.mkdir(parents=True, exist_ok=True)
    return path
|
core/state.py
CHANGED
@@ -137,6 +137,7 @@ class FileObject:
|
|
137 |
sha256: str | None = None
|
138 |
df: pd.DataFrame | None = None
|
139 |
rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
|
|
|
140 |
|
141 |
|
142 |
@dataclasses.dataclass
|
|
|
137 |
sha256: str | None = None
|
138 |
df: pd.DataFrame | None = None
|
139 |
rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
|
140 |
+
folder: epath.PathLike | None = None
|
141 |
|
142 |
|
143 |
@dataclasses.dataclass
|
events/fields.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Any
|
|
3 |
|
4 |
import streamlit as st
|
5 |
|
|
|
6 |
from core.state import Field
|
7 |
from core.state import Metadata
|
8 |
import mlcroissant as mlc
|
@@ -88,7 +89,7 @@ def handle_field_change(
|
|
88 |
elif change == FieldEvent.DESCRIPTION:
|
89 |
field.description = value
|
90 |
elif change == FieldEvent.DATA_TYPE:
|
91 |
-
field.data_types = [value]
|
92 |
elif change == FieldEvent.SOURCE:
|
93 |
node_type = "field" if "/" in value else "distribution"
|
94 |
source = mlc.Source(uid=value, node_type=node_type)
|
|
|
3 |
|
4 |
import streamlit as st
|
5 |
|
6 |
+
from core.data_types import str_to_mlc_data_type
|
7 |
from core.state import Field
|
8 |
from core.state import Metadata
|
9 |
import mlcroissant as mlc
|
|
|
89 |
elif change == FieldEvent.DESCRIPTION:
|
90 |
field.description = value
|
91 |
elif change == FieldEvent.DATA_TYPE:
|
92 |
+
field.data_types = [str_to_mlc_data_type(value)]
|
93 |
elif change == FieldEvent.SOURCE:
|
94 |
node_type = "field" if "/" in value else "distribution"
|
95 |
source = mlc.Source(uid=value, node_type=node_type)
|
events/metadata.py
CHANGED
@@ -4,6 +4,86 @@ import streamlit as st
|
|
4 |
|
5 |
from core.state import Metadata
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
class MetadataEvent(enum.Enum):
|
9 |
"""Event that triggers a metadata change."""
|
@@ -21,7 +101,7 @@ def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
|
|
21 |
elif event == MetadataEvent.DESCRIPTION:
|
22 |
metadata.description = st.session_state[key]
|
23 |
elif event == MetadataEvent.LICENSE:
|
24 |
-
metadata.license = st.session_state[key]
|
25 |
elif event == MetadataEvent.CITATION:
|
26 |
metadata.citation = st.session_state[key]
|
27 |
elif event == MetadataEvent.URL:
|
|
|
4 |
|
5 |
from core.state import Metadata
|
6 |
|
7 |
+
# List from:
|
8 |
+
LICENSES_URL = "https://huggingface.co/docs/hub/repositories-licenses"
|
9 |
+
LICENSES = {
|
10 |
+
"Unknown": "unknown",
|
11 |
+
"Other": "other",
|
12 |
+
"Apache license 2.0": "apache-2.0",
|
13 |
+
"MIT": "mit",
|
14 |
+
"OpenRAIL license family": "openrail",
|
15 |
+
"BigScience OpenRAIL-M": "bigscience-openrail-m",
|
16 |
+
"CreativeML OpenRAIL-M": "creativeml-openrail-m",
|
17 |
+
"BigScience BLOOM RAIL 1.0": "bigscience-bloom-rail-1.0",
|
18 |
+
"BigCode Open RAIL-M v1": "bigcode-openrail-m",
|
19 |
+
"Academic Free License v3.0": "afl-3.0",
|
20 |
+
"Artistic license 2.0": "artistic-2.0",
|
21 |
+
"Boost Software License 1.0": "bsl-1.0",
|
22 |
+
"BSD license family": "bsd",
|
23 |
+
"BSD 2-clause “Simplified” license": "bsd-2-clause",
|
24 |
+
"BSD 3-clause “New” or “Revised” license": "bsd-3-clause",
|
25 |
+
"BSD 3-clause Clear license": "bsd-3-clause-clear",
|
26 |
+
"Computational Use of Data Agreement": "c-uda",
|
27 |
+
"Creative Commons license family": "cc",
|
28 |
+
"Creative Commons Zero v1.0 Universal": "cc0-1.0",
|
29 |
+
"Creative Commons Attribution 2.0": "cc-by-2.0",
|
30 |
+
"Creative Commons Attribution 2.5": "cc-by-2.5",
|
31 |
+
"Creative Commons Attribution 3.0": "cc-by-3.0",
|
32 |
+
"Creative Commons Attribution 4.0": "cc-by-4.0",
|
33 |
+
"Creative Commons Attribution Share Alike 3.0": "cc-by-sa-3.0",
|
34 |
+
"Creative Commons Attribution Share Alike 4.0": "cc-by-sa-4.0",
|
35 |
+
"Creative Commons Attribution Non Commercial 2.0": "cc-by-nc-2.0",
|
36 |
+
"Creative Commons Attribution Non Commercial 3.0": "cc-by-nc-3.0",
|
37 |
+
"Creative Commons Attribution Non Commercial 4.0": "cc-by-nc-4.0",
|
38 |
+
"Creative Commons Attribution No Derivatives 4.0": "cc-by-nd-4.0",
|
39 |
+
"Creative Commons Attribution Non Commercial No Derivatives 3.0": "cc-by-nc-nd-3.0",
|
40 |
+
"Creative Commons Attribution Non Commercial No Derivatives 4.0": "cc-by-nc-nd-4.0",
|
41 |
+
"Creative Commons Attribution Non Commercial Share Alike 2.0": "cc-by-nc-sa-2.0",
|
42 |
+
"Creative Commons Attribution Non Commercial Share Alike 3.0": "cc-by-nc-sa-3.0",
|
43 |
+
"Creative Commons Attribution Non Commercial Share Alike 4.0": "cc-by-nc-sa-4.0",
|
44 |
+
"Community Data License Agreement – Sharing, Version 1.0": "cdla-sharing-1.0",
|
45 |
+
"Community Data License Agreement – Permissive, Version 1.0": "cdla-permissive-1.0",
|
46 |
+
"Community Data License Agreement – Permissive, Version 2.0": "cdla-permissive-2.0",
|
47 |
+
"Do What The F*ck You Want To Public License": "wtfpl",
|
48 |
+
"Educational Community License v2.0": "ecl-2.0",
|
49 |
+
"Eclipse Public License 1.0": "epl-1.0",
|
50 |
+
"Eclipse Public License 2.0": "epl-2.0",
|
51 |
+
"European Union Public License 1.1": "eupl-1.1",
|
52 |
+
"GNU Affero General Public License v3.0": "agpl-3.0",
|
53 |
+
"GNU Free Documentation License family": "gfdl",
|
54 |
+
"GNU General Public License family": "gpl",
|
55 |
+
"GNU General Public License v2.0": "gpl-2.0",
|
56 |
+
"GNU General Public License v3.0": "gpl-3.0",
|
57 |
+
"GNU Lesser General Public License family": "lgpl",
|
58 |
+
"GNU Lesser General Public License v2.1": "lgpl-2.1",
|
59 |
+
"GNU Lesser General Public License v3.0": "lgpl-3.0",
|
60 |
+
"ISC": "isc",
|
61 |
+
"LaTeX Project Public License v1.3c": "lppl-1.3c",
|
62 |
+
"Microsoft Public License": "ms-pl",
|
63 |
+
"Mozilla Public License 2.0": "mpl-2.0",
|
64 |
+
"Open Data Commons License Attribution family": "odc-by",
|
65 |
+
"Open Database License family": "odbl",
|
66 |
+
"Open Rail++-M License": "openrail++",
|
67 |
+
"Open Software License 3.0": "osl-3.0",
|
68 |
+
"PostgreSQL License": "postgresql",
|
69 |
+
"SIL Open Font License 1.1": "ofl-1.1",
|
70 |
+
"University of Illinois/NCSA Open Source License": "ncsa",
|
71 |
+
"The Unlicense": "unlicense",
|
72 |
+
"zLib License": "zlib",
|
73 |
+
"Open Data Commons Public Domain Dedication and License": "pddl",
|
74 |
+
"Lesser General Public License For Linguistic Resources": "lgpl-lr",
|
75 |
+
"DeepFloyd IF Research License Agreement": "deepfloyd-if-license",
|
76 |
+
"Llama 2 Community License Agreement": "llama2",
|
77 |
+
}
|
78 |
+
|
79 |
+
|
80 |
+
def find_license_index(code: str) -> int | None:
    """Returns the position of the license `code` among the values of `LICENSES`.

    Positions follow the iteration order of `LICENSES`. Returns None when no
    license has this code.
    """
    positions = (
        position
        for position, license_code in enumerate(LICENSES.values())
        if license_code == code
    )
    return next(positions, None)
|
86 |
+
|
87 |
|
88 |
class MetadataEvent(enum.Enum):
|
89 |
"""Event that triggers a metadata change."""
|
|
|
101 |
elif event == MetadataEvent.DESCRIPTION:
|
102 |
metadata.description = st.session_state[key]
|
103 |
elif event == MetadataEvent.LICENSE:
|
104 |
+
metadata.license = LICENSES.get(st.session_state[key])
|
105 |
elif event == MetadataEvent.CITATION:
|
106 |
metadata.citation = st.session_state[key]
|
107 |
elif event == MetadataEvent.URL:
|
events/metadata_test.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .metadata import find_license_index
|
2 |
+
|
3 |
+
|
4 |
+
def test_find_license_index():
    """Checks two known license codes and an unknown one."""
    expected_indices = {"unknown": 0, "llama2": 66}
    for code, expected_index in expected_indices.items():
        assert find_license_index(code) == expected_index
    assert find_license_index("fooo") is None
|
events/resources.py
CHANGED
@@ -4,6 +4,8 @@ import enum
|
|
4 |
import streamlit as st
|
5 |
|
6 |
from core.files import FILE_OBJECT
|
|
|
|
|
7 |
from core.state import FileObject
|
8 |
from core.state import FileSet
|
9 |
from core.state import Metadata
|
@@ -37,7 +39,7 @@ def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
|
|
37 |
elif event == ResourceEvent.DESCRIPTION:
|
38 |
resource.description = value
|
39 |
elif event == ResourceEvent.ENCODING_FORMAT:
|
40 |
-
resource.encoding_format = value
|
41 |
elif event == ResourceEvent.INCLUDES:
|
42 |
resource.includes = value
|
43 |
elif event == ResourceEvent.SHA256:
|
@@ -47,6 +49,11 @@ def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
|
|
47 |
elif event == ResourceEvent.CONTENT_SIZE:
|
48 |
resource.content_size = value
|
49 |
elif event == ResourceEvent.CONTENT_URL:
|
|
|
|
|
|
|
|
|
|
|
50 |
resource.content_url = value
|
51 |
elif event == ResourceEvent.TYPE:
|
52 |
metadata: Metadata = st.session_state[Metadata]
|
|
|
4 |
import streamlit as st
|
5 |
|
6 |
from core.files import FILE_OBJECT
|
7 |
+
from core.files import name_to_code
|
8 |
+
from core.path import get_resource_path
|
9 |
from core.state import FileObject
|
10 |
from core.state import FileSet
|
11 |
from core.state import Metadata
|
|
|
39 |
elif event == ResourceEvent.DESCRIPTION:
|
40 |
resource.description = value
|
41 |
elif event == ResourceEvent.ENCODING_FORMAT:
|
42 |
+
resource.encoding_format = name_to_code(value)
|
43 |
elif event == ResourceEvent.INCLUDES:
|
44 |
resource.includes = value
|
45 |
elif event == ResourceEvent.SHA256:
|
|
|
49 |
elif event == ResourceEvent.CONTENT_SIZE:
|
50 |
resource.content_size = value
|
51 |
elif event == ResourceEvent.CONTENT_URL:
|
52 |
+
if resource.content_url and value:
|
53 |
+
old_path = get_resource_path(resource.content_url)
|
54 |
+
new_path = get_resource_path(value)
|
55 |
+
if old_path.exists() and not new_path.exists():
|
56 |
+
old_path.rename(new_path)
|
57 |
resource.content_url = value
|
58 |
elif event == ResourceEvent.TYPE:
|
59 |
metadata: Metadata = st.session_state[Metadata]
|
utils.py
CHANGED
@@ -41,14 +41,3 @@ def init_state(force=False):
|
|
41 |
|
42 |
if OpenTab not in st.session_state or force:
|
43 |
st.session_state[OpenTab] = None
|
44 |
-
|
45 |
-
# Uncomment those lines if you work locally in order to avoid clicks at each reload.
|
46 |
-
# And comment all previous lines in `init_state`.
|
47 |
-
# if mlc.Dataset not in st.session_state or force:
|
48 |
-
# st.session_state[mlc.Dataset] = mlc.Dataset("../datasets/titanic/metadata.json")
|
49 |
-
# if Metadata not in st.session_state or force:
|
50 |
-
# st.session_state[Metadata] = Metadata.from_canonical(
|
51 |
-
# st.session_state[mlc.Dataset].metadata
|
52 |
-
# )
|
53 |
-
# if CurrentProject not in st.session_state or force:
|
54 |
-
# st.session_state[CurrentProject] = CurrentProject.create_new()
|
|
|
41 |
|
42 |
if OpenTab not in st.session_state or force:
|
43 |
st.session_state[OpenTab] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
views/files.py
CHANGED
@@ -1,7 +1,10 @@
|
|
|
|
1 |
import streamlit as st
|
2 |
|
3 |
from components.tree import render_tree
|
4 |
from core.constants import DF_HEIGHT
|
|
|
|
|
5 |
from core.files import file_from_form
|
6 |
from core.files import file_from_upload
|
7 |
from core.files import file_from_url
|
@@ -9,7 +12,9 @@ from core.files import FILE_OBJECT
|
|
9 |
from core.files import FILE_SET
|
10 |
from core.files import FILE_TYPES
|
11 |
from core.files import RESOURCE_TYPES
|
|
|
12 |
from core.record_sets import infer_record_sets
|
|
|
13 |
from core.state import FileObject
|
14 |
from core.state import FileSet
|
15 |
from core.state import Metadata
|
@@ -23,10 +28,6 @@ Resource = FileObject | FileSet
|
|
23 |
_DISTANT_URL_KEY = "import_from_url"
|
24 |
_LOCAL_FILE_KEY = "import_from_local_file"
|
25 |
_MANUAL_RESOURCE_TYPE_KEY = "create_manually_type"
|
26 |
-
_MANUAL_NAME_KEY = "manual_object_name"
|
27 |
-
_MANUAL_DESCRIPTION_KEY = "manual_object_description"
|
28 |
-
_MANUAL_SHA256_KEY = "manual_object_sha256"
|
29 |
-
_MANUAL_PARENT_KEY = "manual_object_parents"
|
30 |
|
31 |
_INFO = """Resources can be `FileObjects` (single files) or `FileSets` (sets of files
|
32 |
with the same MIME type). On this page, you can upload `FileObjects`, point to external
|
@@ -34,6 +35,8 @@ resources on the web or manually create new resources."""
|
|
34 |
|
35 |
|
36 |
def render_files():
|
|
|
|
|
37 |
col1, col2, col3 = st.columns([1, 1, 1], gap="small")
|
38 |
with col1:
|
39 |
st.markdown("##### Upload more resources")
|
@@ -47,6 +50,31 @@ def render_files():
|
|
47 |
_render_right_panel()
|
48 |
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
def _render_resources_panel(files: list[Resource]) -> Resource | None:
|
51 |
"""Renders the left panel: the list of all resources."""
|
52 |
filename_to_file: dict[str, list[Resource]] = {}
|
@@ -99,13 +127,15 @@ def _render_upload_panel():
|
|
99 |
file_type = FILE_TYPES[file_type_name]
|
100 |
metadata: Metadata = st.session_state[Metadata]
|
101 |
names = metadata.names()
|
|
|
|
|
102 |
if url:
|
103 |
-
file = file_from_url(file_type, url, names)
|
104 |
elif uploaded_file:
|
105 |
-
file = file_from_upload(file_type, uploaded_file, names)
|
106 |
else:
|
107 |
resource_type = st.session_state[_MANUAL_RESOURCE_TYPE_KEY]
|
108 |
-
file = file_from_form(resource_type, names)
|
109 |
|
110 |
st.session_state[Metadata].add_distribution(file)
|
111 |
record_sets = infer_record_sets(file, names)
|
@@ -157,7 +187,7 @@ def _render_resource_details(selected_file: Resource):
|
|
157 |
col1, col2 = st.columns([1, 1])
|
158 |
col1.button("Close", key=f"{i}_close", on_click=close, type="primary")
|
159 |
col2.button(
|
160 |
-
"Remove", key=f"{i}_remove", on_click=delete_line, type="secondary"
|
161 |
)
|
162 |
|
163 |
|
@@ -224,9 +254,10 @@ def _render_resource(prefix: int, file: FileObject | FileSet, is_file_object: bo
|
|
224 |
args=(ResourceEvent.INCLUDES, file, key),
|
225 |
)
|
226 |
key = f"{prefix}_encoding"
|
227 |
-
st.
|
228 |
needed_field("Encoding format"),
|
229 |
-
|
|
|
230 |
key=key,
|
231 |
on_change=handle_resource_change,
|
232 |
args=(ResourceEvent.ENCODING_FORMAT, file, key),
|
|
|
1 |
+
from etils import epath
|
2 |
import streamlit as st
|
3 |
|
4 |
from components.tree import render_tree
|
5 |
from core.constants import DF_HEIGHT
|
6 |
+
from core.constants import OAUTH_CLIENT_ID
|
7 |
+
from core.files import code_to_index
|
8 |
from core.files import file_from_form
|
9 |
from core.files import file_from_upload
|
10 |
from core.files import file_from_url
|
|
|
12 |
from core.files import FILE_SET
|
13 |
from core.files import FILE_TYPES
|
14 |
from core.files import RESOURCE_TYPES
|
15 |
+
from core.path import get_resource_path
|
16 |
from core.record_sets import infer_record_sets
|
17 |
+
from core.state import CurrentProject
|
18 |
from core.state import FileObject
|
19 |
from core.state import FileSet
|
20 |
from core.state import Metadata
|
|
|
28 |
_DISTANT_URL_KEY = "import_from_url"
|
29 |
_LOCAL_FILE_KEY = "import_from_local_file"
|
30 |
_MANUAL_RESOURCE_TYPE_KEY = "create_manually_type"
|
|
|
|
|
|
|
|
|
31 |
|
32 |
_INFO = """Resources can be `FileObjects` (single files) or `FileSets` (sets of files
|
33 |
with the same MIME type). On this page, you can upload `FileObjects`, point to external
|
|
|
35 |
|
36 |
|
37 |
def render_files():
|
38 |
+
"""Renders the views of the files: warnings and panels to display information."""
|
39 |
+
_render_warnings()
|
40 |
col1, col2, col3 = st.columns([1, 1, 1], gap="small")
|
41 |
with col1:
|
42 |
st.markdown("##### Upload more resources")
|
|
|
50 |
_render_right_panel()
|
51 |
|
52 |
|
53 |
+
def _render_warnings():
    """Warns about resources whose local content URL has no file on disk.

    Every resource of the metadata with a non-HTTP content URL is looked up on
    disk; one message per missing file is collected and all of them are shown
    in a single Streamlit warning.
    """
    metadata: Metadata = st.session_state[Metadata]
    messages = []
    for resource in metadata.distribution:
        content_url = resource.content_url
        # Only non-empty content URLs that are not HTTP(S) are checked on disk.
        if not content_url or content_url.startswith("http"):
            continue
        path = get_resource_path(content_url)
        if path.exists():
            continue
        if OAUTH_CLIENT_ID:
            # With an OAuth client configured, the on-disk path is not shown.
            message = (
                f'⚠️ Resource "{resource.name}" points to a local file, but'
                " doesn't exist on the disk. Fix this by changing the content"
                " URL.\n\n"
            )
        else:
            message = (
                f'⚠️ Resource "{resource.name}" points to a local file, but'
                " doesn't exist on the disk. Fix this by either downloading"
                f" it to {path} or changing the content URL.\n\n"
            )
        messages.append(message)
    combined = "".join(messages)
    if combined:
        st.warning(combined.strip())
|
76 |
+
|
77 |
+
|
78 |
def _render_resources_panel(files: list[Resource]) -> Resource | None:
|
79 |
"""Renders the left panel: the list of all resources."""
|
80 |
filename_to_file: dict[str, list[Resource]] = {}
|
|
|
127 |
file_type = FILE_TYPES[file_type_name]
|
128 |
metadata: Metadata = st.session_state[Metadata]
|
129 |
names = metadata.names()
|
130 |
+
project: CurrentProject = st.session_state[CurrentProject]
|
131 |
+
folder = project.path
|
132 |
if url:
|
133 |
+
file = file_from_url(file_type, url, names, folder)
|
134 |
elif uploaded_file:
|
135 |
+
file = file_from_upload(file_type, uploaded_file, names, folder)
|
136 |
else:
|
137 |
resource_type = st.session_state[_MANUAL_RESOURCE_TYPE_KEY]
|
138 |
+
file = file_from_form(resource_type, names, folder)
|
139 |
|
140 |
st.session_state[Metadata].add_distribution(file)
|
141 |
record_sets = infer_record_sets(file, names)
|
|
|
187 |
col1, col2 = st.columns([1, 1])
|
188 |
col1.button("Close", key=f"{i}_close", on_click=close, type="primary")
|
189 |
col2.button(
|
190 |
+
"⚠️ Remove", key=f"{i}_remove", on_click=delete_line, type="secondary"
|
191 |
)
|
192 |
|
193 |
|
|
|
254 |
args=(ResourceEvent.INCLUDES, file, key),
|
255 |
)
|
256 |
key = f"{prefix}_encoding"
|
257 |
+
st.selectbox(
|
258 |
needed_field("Encoding format"),
|
259 |
+
index=code_to_index(file.encoding_format),
|
260 |
+
options=FILE_TYPES.keys(),
|
261 |
key=key,
|
262 |
on_change=handle_resource_change,
|
263 |
args=(ResourceEvent.ENCODING_FORMAT, file, key),
|
views/metadata.py
CHANGED
@@ -1,42 +1,35 @@
|
|
1 |
-
import enum
|
2 |
-
|
3 |
import streamlit as st
|
4 |
|
5 |
from core.state import Metadata
|
|
|
6 |
from events.metadata import handle_metadata_change
|
|
|
|
|
7 |
from events.metadata import MetadataEvent
|
8 |
|
9 |
-
# List from https://www.kaggle.com/discussions/general/116302.
|
10 |
-
licenses = [
|
11 |
-
"Other",
|
12 |
-
"Public Domain",
|
13 |
-
"Public",
|
14 |
-
"CC-0",
|
15 |
-
"PDDL",
|
16 |
-
"CC-BY",
|
17 |
-
"CDLA-Permissive-1.0",
|
18 |
-
"ODC-BY",
|
19 |
-
"CC-BY-SA",
|
20 |
-
"CDLA-Sharing-1.0",
|
21 |
-
"ODC-ODbL",
|
22 |
-
"CC BY-NC",
|
23 |
-
"CC BY-ND",
|
24 |
-
"CC BY-NC-SA",
|
25 |
-
"CC BY-NC-ND",
|
26 |
-
]
|
27 |
-
|
28 |
|
29 |
def render_metadata():
|
|
|
30 |
metadata = st.session_state[Metadata]
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
key = "metadata-license"
|
36 |
st.selectbox(
|
37 |
label="License",
|
|
|
|
|
|
|
|
|
38 |
key=key,
|
39 |
-
options=
|
40 |
index=index,
|
41 |
on_change=handle_metadata_change,
|
42 |
args=(MetadataEvent.LICENSE, metadata, key),
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
|
3 |
from core.state import Metadata
|
4 |
+
from events.metadata import find_license_index
|
5 |
from events.metadata import handle_metadata_change
|
6 |
+
from events.metadata import LICENSES
|
7 |
+
from events.metadata import LICENSES_URL
|
8 |
from events.metadata import MetadataEvent
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
def render_metadata():
|
12 |
+
"""Renders the `Metadata` view."""
|
13 |
metadata = st.session_state[Metadata]
|
14 |
+
index = find_license_index(metadata.license)
|
15 |
+
key = "metadata-url"
|
16 |
+
st.text_input(
|
17 |
+
label="URL",
|
18 |
+
key=key,
|
19 |
+
value=metadata.url,
|
20 |
+
placeholder="URL to the dataset.",
|
21 |
+
on_change=handle_metadata_change,
|
22 |
+
args=(MetadataEvent.URL, metadata, key),
|
23 |
+
)
|
24 |
key = "metadata-license"
|
25 |
st.selectbox(
|
26 |
label="License",
|
27 |
+
help=(
|
28 |
+
"More information on license names and meaning can be found"
|
29 |
+
f" [here]({LICENSES_URL})."
|
30 |
+
),
|
31 |
key=key,
|
32 |
+
options=LICENSES.keys(),
|
33 |
index=index,
|
34 |
on_change=handle_metadata_change,
|
35 |
args=(MetadataEvent.LICENSE, metadata, key),
|
views/overview.py
CHANGED
@@ -42,17 +42,6 @@ def render_overview():
|
|
42 |
)
|
43 |
if not name:
|
44 |
st.stop()
|
45 |
-
key = "metadata-url"
|
46 |
-
url = st.text_input(
|
47 |
-
label=needed_field("URL"),
|
48 |
-
key=key,
|
49 |
-
value=metadata.url,
|
50 |
-
placeholder="URL to the dataset.",
|
51 |
-
on_change=handle_metadata_change,
|
52 |
-
args=(MetadataEvent.URL, metadata, key),
|
53 |
-
)
|
54 |
-
if not url:
|
55 |
-
st.stop()
|
56 |
key = "metadata-description"
|
57 |
st.text_area(
|
58 |
label="Description",
|
|
|
42 |
)
|
43 |
if not name:
|
44 |
st.stop()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
key = "metadata-description"
|
46 |
st.text_area(
|
47 |
label="Description",
|
views/record_sets.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import multiprocessing
|
2 |
import textwrap
|
3 |
import time
|
|
|
4 |
from typing import TypedDict
|
5 |
|
6 |
import numpy as np
|
@@ -8,6 +9,10 @@ import pandas as pd
|
|
8 |
from rdflib import term
|
9 |
import streamlit as st
|
10 |
|
|
|
|
|
|
|
|
|
11 |
from core.query_params import expand_record_set
|
12 |
from core.query_params import is_record_set_expanded
|
13 |
from core.state import Field
|
@@ -23,14 +28,6 @@ from views.source import handle_field_change
|
|
23 |
from views.source import render_references
|
24 |
from views.source import render_source
|
25 |
|
26 |
-
DATA_TYPES = [
|
27 |
-
mlc.DataType.TEXT,
|
28 |
-
mlc.DataType.FLOAT,
|
29 |
-
mlc.DataType.INTEGER,
|
30 |
-
mlc.DataType.BOOL,
|
31 |
-
mlc.DataType.URL,
|
32 |
-
]
|
33 |
-
|
34 |
_NUM_RECORDS = 3
|
35 |
_TIMEOUT_SECONDS = 1
|
36 |
|
@@ -40,7 +37,16 @@ class _Result(TypedDict):
|
|
40 |
exception: Exception | None
|
41 |
|
42 |
|
43 |
-
@st.cache_data(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
def _generate_data_with_timeout(record_set: RecordSet) -> _Result:
|
45 |
"""Generates the data and waits at most _TIMEOUT_SECONDS."""
|
46 |
with multiprocessing.Manager() as manager:
|
@@ -65,7 +71,7 @@ def _generate_data(record_set: RecordSet, result: _Result) -> pd.DataFrame | Non
|
|
65 |
"""Generates the first _NUM_RECORDS records."""
|
66 |
try:
|
67 |
metadata: Metadata = st.session_state[Metadata]
|
68 |
-
if
|
69 |
raise ValueError(
|
70 |
"The dataset is still incomplete. Please, go to the overview to see"
|
71 |
" errors."
|
@@ -87,8 +93,8 @@ def _generate_data(record_set: RecordSet, result: _Result) -> pd.DataFrame | Non
|
|
87 |
pass
|
88 |
df.append(record)
|
89 |
result["df"] = pd.DataFrame(df)
|
90 |
-
except Exception
|
91 |
-
result["exception"] =
|
92 |
|
93 |
|
94 |
def _handle_close_fields():
|
@@ -154,6 +160,10 @@ def _handle_create_record_set():
|
|
154 |
metadata.add_record_set(RecordSet(name="new-record-set", description=""))
|
155 |
|
156 |
|
|
|
|
|
|
|
|
|
157 |
def _handle_fields_change(record_set_key: int, record_set: RecordSet):
|
158 |
expand_record_set(record_set=record_set)
|
159 |
data_editor_key = _data_editor_key(record_set_key, record_set)
|
@@ -172,12 +182,13 @@ def _handle_fields_change(record_set_key: int, record_set: RecordSet):
|
|
172 |
elif new_field == FieldDataFrame.DESCRIPTION:
|
173 |
field.description = new_value
|
174 |
elif new_field == FieldDataFrame.DATA_TYPE:
|
175 |
-
field.data_types = [new_value]
|
176 |
for added_row in result["added_rows"]:
|
|
|
177 |
field = Field(
|
178 |
name=added_row.get(FieldDataFrame.NAME),
|
179 |
description=added_row.get(FieldDataFrame.DESCRIPTION),
|
180 |
-
data_types=[
|
181 |
source=mlc.Source(),
|
182 |
references=mlc.Source(),
|
183 |
)
|
@@ -296,7 +307,7 @@ def _render_left_panel():
|
|
296 |
# TODO(https://github.com/mlcommons/croissant/issues/350): Allow to display
|
297 |
# several data types, not only the first.
|
298 |
data_types = [
|
299 |
-
field.data_types[0] if field.data_types else None
|
300 |
for field in record_set.fields
|
301 |
]
|
302 |
fields = pd.DataFrame(
|
@@ -331,7 +342,7 @@ def _render_left_panel():
|
|
331 |
FieldDataFrame.DATA_TYPE: st.column_config.SelectboxColumn(
|
332 |
FieldDataFrame.DATA_TYPE,
|
333 |
help="The Croissant type",
|
334 |
-
options=
|
335 |
required=True,
|
336 |
),
|
337 |
},
|
@@ -365,6 +376,14 @@ def _render_left_panel():
|
|
365 |
on_click=_handle_on_click_field,
|
366 |
args=(record_set_key, record_set),
|
367 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
st.button(
|
369 |
"Create a new RecordSet",
|
370 |
key=f"create-new-record-set",
|
@@ -432,8 +451,8 @@ def _render_right_panel():
|
|
432 |
data_type = field.data_types[0]
|
433 |
if isinstance(data_type, str):
|
434 |
data_type = term.URIRef(data_type)
|
435 |
-
if data_type in
|
436 |
-
data_type_index =
|
437 |
else:
|
438 |
data_type_index = None
|
439 |
else:
|
@@ -442,7 +461,7 @@ def _render_right_panel():
|
|
442 |
col3.selectbox(
|
443 |
needed_field("Data type"),
|
444 |
index=data_type_index,
|
445 |
-
options=
|
446 |
key=key,
|
447 |
on_change=handle_field_change,
|
448 |
args=(FieldEvent.DATA_TYPE, field, key),
|
|
|
1 |
import multiprocessing
|
2 |
import textwrap
|
3 |
import time
|
4 |
+
import traceback
|
5 |
from typing import TypedDict
|
6 |
|
7 |
import numpy as np
|
|
|
9 |
from rdflib import term
|
10 |
import streamlit as st
|
11 |
|
12 |
+
from core.data_types import MLC_DATA_TYPES
|
13 |
+
from core.data_types import mlc_to_str_data_type
|
14 |
+
from core.data_types import STR_DATA_TYPES
|
15 |
+
from core.data_types import str_to_mlc_data_type
|
16 |
from core.query_params import expand_record_set
|
17 |
from core.query_params import is_record_set_expanded
|
18 |
from core.state import Field
|
|
|
28 |
from views.source import render_references
|
29 |
from views.source import render_source
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
_NUM_RECORDS = 3
|
32 |
_TIMEOUT_SECONDS = 1
|
33 |
|
|
|
37 |
exception: Exception | None
|
38 |
|
39 |
|
40 |
+
@st.cache_data(
|
41 |
+
show_spinner="Generating the dataset...",
|
42 |
+
hash_funcs={
|
43 |
+
"mlcroissant.Metadata": hash,
|
44 |
+
"mlcroissant.Field": hash,
|
45 |
+
"mlcroissant.FileObject": hash,
|
46 |
+
"mlcroissant.FileSet": hash,
|
47 |
+
"mlcroissant.RecordSet": hash,
|
48 |
+
},
|
49 |
+
)
|
50 |
def _generate_data_with_timeout(record_set: RecordSet) -> _Result:
|
51 |
"""Generates the data and waits at most _TIMEOUT_SECONDS."""
|
52 |
with multiprocessing.Manager() as manager:
|
|
|
71 |
"""Generates the first _NUM_RECORDS records."""
|
72 |
try:
|
73 |
metadata: Metadata = st.session_state[Metadata]
|
74 |
+
if metadata is None:
|
75 |
raise ValueError(
|
76 |
"The dataset is still incomplete. Please, go to the overview to see"
|
77 |
" errors."
|
|
|
93 |
pass
|
94 |
df.append(record)
|
95 |
result["df"] = pd.DataFrame(df)
|
96 |
+
except Exception:
|
97 |
+
result["exception"] = traceback.format_exc()
|
98 |
|
99 |
|
100 |
def _handle_close_fields():
|
|
|
160 |
metadata.add_record_set(RecordSet(name="new-record-set", description=""))
|
161 |
|
162 |
|
163 |
+
def _handle_remove_record_set(record_set_key: int):
|
164 |
+
del st.session_state[Metadata].record_sets[record_set_key]
|
165 |
+
|
166 |
+
|
167 |
def _handle_fields_change(record_set_key: int, record_set: RecordSet):
|
168 |
expand_record_set(record_set=record_set)
|
169 |
data_editor_key = _data_editor_key(record_set_key, record_set)
|
|
|
182 |
elif new_field == FieldDataFrame.DESCRIPTION:
|
183 |
field.description = new_value
|
184 |
elif new_field == FieldDataFrame.DATA_TYPE:
|
185 |
+
field.data_types = [str_to_mlc_data_type(new_value)]
|
186 |
for added_row in result["added_rows"]:
|
187 |
+
data_type = str_to_mlc_data_type(added_row.get(FieldDataFrame.DATA_TYPE))
|
188 |
field = Field(
|
189 |
name=added_row.get(FieldDataFrame.NAME),
|
190 |
description=added_row.get(FieldDataFrame.DESCRIPTION),
|
191 |
+
data_types=[data_type],
|
192 |
source=mlc.Source(),
|
193 |
references=mlc.Source(),
|
194 |
)
|
|
|
307 |
# TODO(https://github.com/mlcommons/croissant/issues/350): Allow to display
|
308 |
# several data types, not only the first.
|
309 |
data_types = [
|
310 |
+
mlc_to_str_data_type(field.data_types[0]) if field.data_types else None
|
311 |
for field in record_set.fields
|
312 |
]
|
313 |
fields = pd.DataFrame(
|
|
|
342 |
FieldDataFrame.DATA_TYPE: st.column_config.SelectboxColumn(
|
343 |
FieldDataFrame.DATA_TYPE,
|
344 |
help="The Croissant type",
|
345 |
+
options=STR_DATA_TYPES,
|
346 |
required=True,
|
347 |
),
|
348 |
},
|
|
|
376 |
on_click=_handle_on_click_field,
|
377 |
args=(record_set_key, record_set),
|
378 |
)
|
379 |
+
key = f"{prefix}-delete-record-set"
|
380 |
+
st.button(
|
381 |
+
"⚠️ Delete RecordSet",
|
382 |
+
type="primary",
|
383 |
+
key=key,
|
384 |
+
on_click=_handle_remove_record_set,
|
385 |
+
args=(record_set_key,),
|
386 |
+
)
|
387 |
st.button(
|
388 |
"Create a new RecordSet",
|
389 |
key=f"create-new-record-set",
|
|
|
451 |
data_type = field.data_types[0]
|
452 |
if isinstance(data_type, str):
|
453 |
data_type = term.URIRef(data_type)
|
454 |
+
if data_type in MLC_DATA_TYPES:
|
455 |
+
data_type_index = MLC_DATA_TYPES.index(data_type)
|
456 |
else:
|
457 |
data_type_index = None
|
458 |
else:
|
|
|
461 |
col3.selectbox(
|
462 |
needed_field("Data type"),
|
463 |
index=data_type_index,
|
464 |
+
options=STR_DATA_TYPES,
|
465 |
key=key,
|
466 |
on_change=handle_field_change,
|
467 |
args=(FieldEvent.DATA_TYPE, field, key),
|