marcenacp committed
Commit e92e659
1 Parent(s): f82850d

Deploy (see actual commits on https://github.com/mlcommons/croissant).

Dockerfile ADDED
@@ -0,0 +1,23 @@
+# app/Dockerfile
+
+FROM python:3.10-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    software-properties-common \
+    git \
+    python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY ./ /app/
+
+RUN python3 -m pip install -r requirements.txt
+
+EXPOSE 8501
+
+HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+
+ENTRYPOINT ["streamlit", "run", "app.py"]
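
Note: the image serves Streamlit on port 8501, and the HEALTHCHECK relies on Streamlit's built-in /_stcore/health endpoint. A minimal sketch of probing that same endpoint from Python once the container is running (assuming it is published on localhost:8501; requests is an extra dependency here):

import requests

# Same endpoint the HEALTHCHECK hits with `curl --fail`.
response = requests.get("http://localhost:8501/_stcore/health", timeout=5)
response.raise_for_status()  # a non-2xx status would mark the container unhealthy
print(response.text)  # Streamlit answers "ok" when the server is up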
core/data_types.py CHANGED
@@ -17,3 +17,30 @@ def convert_dtype(dtype: Any):
         return mlc.DataType.TEXT
     else:
         raise NotImplementedError(dtype)
+
+
+MLC_DATA_TYPES = [
+    mlc.DataType.TEXT,
+    mlc.DataType.FLOAT,
+    mlc.DataType.INTEGER,
+    mlc.DataType.BOOL,
+    mlc.DataType.URL,
+]
+
+STR_DATA_TYPES = [
+    str(data_type).replace("https://schema.org/", "") for data_type in MLC_DATA_TYPES
+]
+
+
+def str_to_mlc_data_type(data_type: str) -> mlc.DataType | None:
+    for str_data_type, mlc_data_type in zip(STR_DATA_TYPES, MLC_DATA_TYPES):
+        if data_type == str_data_type:
+            return mlc_data_type
+    return None
+
+
+def mlc_to_str_data_type(data_type: mlc.DataType) -> str | None:
+    for str_data_type, mlc_data_type in zip(STR_DATA_TYPES, MLC_DATA_TYPES):
+        if data_type == mlc_data_type:
+            return str_data_type
+    return None
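
The two helpers above invert each other via the shared list order. A minimal usage sketch (assuming mlcroissant is installed and the app's `core.data_types` module is importable; the expected values follow from STR_DATA_TYPES stripping the https://schema.org/ prefix):

import mlcroissant as mlc

from core.data_types import mlc_to_str_data_type, str_to_mlc_data_type

# "Integer" is the short UI label for https://schema.org/Integer.
assert str_to_mlc_data_type("Integer") == mlc.DataType.INTEGER
assert mlc_to_str_data_type(mlc.DataType.INTEGER) == "Integer"
# Unknown labels fall through to None instead of raising.
assert str_to_mlc_data_type("Foo") is None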
core/data_types_test.py CHANGED
@@ -3,7 +3,10 @@
 import numpy as np
 import pytest
 
+import mlcroissant as mlc
+
 from .data_types import convert_dtype
+from .data_types import str_to_mlc_data_type
 
 
 def test_convert_dtype():
@@ -13,3 +16,8 @@ def test_convert_dtype():
    convert_dtype(np.str_) == "https://schema.org/Text"
    with pytest.raises(NotImplementedError):
        convert_dtype(np.float32)
+
+
+def test_str_to_mlc_data_type():
+    assert str_to_mlc_data_type("Integer") == mlc.DataType.INTEGER
+    assert str_to_mlc_data_type("Foo") is None
core/files.py CHANGED
@@ -8,6 +8,7 @@ import pandas as pd
 import requests
 
 from .names import find_unique_name
+from .path import get_resource_path
 from .state import FileObject
 from .state import FileSet
 
@@ -30,6 +31,8 @@ class FileTypes:
         encoding_format="application/vnd.ms-excel",
         extensions=["xls", "xlsx", "xlsm"],
     )
+    GZIP = FileType(name="GZIP", encoding_format="application/gzip", extensions=["gz"])
+    JPEG = FileType(name="JPEG", encoding_format="image/jpeg", extensions=["jpeg", "jpg"])
     JSON = FileType(
         name="JSON", encoding_format="application/json", extensions=["json"]
     )
@@ -43,20 +46,60 @@ class FileTypes:
         encoding_format="application/vnd.apache.parquet",
         extensions=["parquet"],
     )
+    TAR = FileType(
+        name="Archive (TAR)",
+        encoding_format="application/x-tar",
+        extensions=["tar"],
+    )
+    TXT = FileType(
+        name="Text",
+        encoding_format="text/plain",
+        extensions=["txt"],
+    )
+    ZIP = FileType(
+        name="ZIP",
+        encoding_format="application/zip",
+        extensions=["zip"],
+    )
+
+
+def _full_name(file_type: FileType):
+    return f"{file_type.name} ({file_type.encoding_format})"
 
 
 FILE_TYPES: dict[str, FileType] = {
-    file_type.name: file_type
+    _full_name(file_type): file_type
     for file_type in [
        FileTypes.CSV,
        FileTypes.EXCEL,
+        FileTypes.GZIP,
+        FileTypes.JPEG,
        FileTypes.JSON,
        FileTypes.JSONL,
        FileTypes.PARQUET,
+        FileTypes.TAR,
+        FileTypes.TXT,
+        FileTypes.ZIP,
    ]
 }
 
 
+def name_to_code(file_type_name: str) -> str | None:
+    """Maps names to the encoding format: Text (text/plain) => text/plain."""
+    for name, file_type in FILE_TYPES.items():
+        if file_type_name == name:
+            return file_type.encoding_format
+    return None
+
+
+def code_to_index(encoding_format: str) -> int | None:
+    """Maps the encoding format to its index in the list of keys: text/plain => 8."""
+    for i, file_type in enumerate(FILE_TYPES.values()):
+        if file_type.encoding_format == encoding_format:
+            return i
+    return None
+
+
 def _sha256(content: bytes):
     """Computes the sha256 digest of the byte string."""
     return hashlib.sha256(content).hexdigest()
@@ -97,7 +140,9 @@ def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.Data
         raise NotImplementedError()
 
 
-def file_from_url(file_type: FileType, url: str, names: set[str]) -> FileObject:
+def file_from_url(
+    file_type: FileType, url: str, names: set[str], folder: epath.Path
+) -> FileObject:
     """Downloads locally and extracts the file information."""
     file_path = hash_file_path(url)
     if not file_path.exists():
@@ -112,30 +157,38 @@ def file_from_url(file_type: FileType, url: str, names: set[str]) -> FileObject:
         encoding_format=file_type.encoding_format,
         sha256=sha256,
         df=df,
+        folder=folder,
     )
 
 
 def file_from_upload(
-    file_type: FileType, file: io.BytesIO, names: set[str]
+    file_type: FileType, file: io.BytesIO, names: set[str], folder: epath.Path
 ) -> FileObject:
     """Uploads locally and extracts the file information."""
-    sha256 = _sha256(file.getvalue())
+    value = file.getvalue()
+    content_url = f"data/{file.name}"
+    sha256 = _sha256(value)
+    with get_resource_path(content_url).open("wb") as f:
+        f.write(value)
     df = get_dataframe(file_type, file).infer_objects()
     return FileObject(
         name=find_unique_name(names, file.name),
         description="",
-        content_url=f"data/{file.name}",
+        content_url=content_url,
         encoding_format=file_type.encoding_format,
         sha256=sha256,
         df=df,
+        folder=folder,
     )
 
 
-def file_from_form(type: str, names: set[str]) -> FileObject | FileSet:
+def file_from_form(
+    type: str, names: set[str], folder: epath.Path
+) -> FileObject | FileSet:
     """Creates a file based on manually added fields."""
     if type == FILE_OBJECT:
-        return FileObject(name=find_unique_name(names, "file_object"))
+        return FileObject(name=find_unique_name(names, "file_object"), folder=folder)
     elif type == FILE_SET:
-        return FileSet(name=find_unique_name(names, "file_set"))
+        return FileSet(name=find_unique_name(names, "file_set"), folder=folder)
     else:
        raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
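
With _full_name as the dict key, the selectbox labels now embed the MIME type, and the two lookup helpers translate between label, encoding format, and selectbox index. A small sketch of the intended round trip (assuming the app's core.files module is importable; ZIP is used because its definition is visible above):

from core.files import FILE_TYPES, code_to_index, name_to_code

# The selectbox label "ZIP (application/zip)" maps to its MIME type...
assert name_to_code("ZIP (application/zip)") == "application/zip"
# ...and a stored MIME type maps back to the label's position in the options.
index = code_to_index("application/zip")
assert list(FILE_TYPES)[index] == "ZIP (application/zip)"
# Unknown inputs return None rather than raising.
assert name_to_code("unknown") is None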
core/files_test.py CHANGED
@@ -18,10 +18,10 @@ def test_check_file_csv():
         f.write("a,1\n")
         f.write("b,2\n")
         f.write("c,3\n")
-    file = file_from_url(FileTypes.CSV, "https://my.url", set())
+    file = file_from_url(FileTypes.CSV, "https://my.url", set(), epath.Path())
     pd.testing.assert_frame_equal(
         file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
     )
     # Fails with unknown encoding_format:
     with pytest.raises(NotImplementedError):
-        file_from_url("unknown", "https://my.url", set())
+        file_from_url("unknown", "https://my.url", set(), epath.Path())
core/path.py ADDED
@@ -0,0 +1,13 @@
+from etils import epath
+import streamlit as st
+
+from core.state import CurrentProject
+
+
+def get_resource_path(content_url: str) -> epath.Path:
+    """Gets the path on disk of the resource with `content_url`."""
+    project: CurrentProject = st.session_state[CurrentProject]
+    path = project.path / content_url
+    if not path.parent.exists():
+        path.parent.mkdir(parents=True, exist_ok=True)
+    return path
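
A sketch of how get_resource_path is meant to be used (hypothetical paths; CurrentProject must already be in st.session_state, which the app sets up elsewhere):

from core.path import get_resource_path

# If the current project lives under e.g. ~/.croissant/projects/<id>, then a
# resource with content_url "data/train.csv" resolves to
# ~/.croissant/projects/<id>/data/train.csv; the "data/" parent directory is
# created on first access.
path = get_resource_path("data/train.csv")
with path.open("wb") as f:
    f.write(b"col1,col2\n")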
core/state.py CHANGED
@@ -137,6 +137,7 @@ class FileObject:
     sha256: str | None = None
     df: pd.DataFrame | None = None
     rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
+    folder: epath.PathLike | None = None
 
 
 @dataclasses.dataclass
events/fields.py CHANGED
@@ -3,6 +3,7 @@ from typing import Any
 
 import streamlit as st
 
+from core.data_types import str_to_mlc_data_type
 from core.state import Field
 from core.state import Metadata
 import mlcroissant as mlc
@@ -88,7 +89,7 @@ def handle_field_change(
    elif change == FieldEvent.DESCRIPTION:
        field.description = value
    elif change == FieldEvent.DATA_TYPE:
-        field.data_types = [value]
+        field.data_types = [str_to_mlc_data_type(value)]
    elif change == FieldEvent.SOURCE:
        node_type = "field" if "/" in value else "distribution"
        source = mlc.Source(uid=value, node_type=node_type)
events/metadata.py CHANGED
@@ -4,6 +4,86 @@ import streamlit as st
 
 from core.state import Metadata
 
+# List from: https://huggingface.co/docs/hub/repositories-licenses.
+LICENSES_URL = "https://huggingface.co/docs/hub/repositories-licenses"
+LICENSES = {
+    "Unknown": "unknown",
+    "Other": "other",
+    "Apache license 2.0": "apache-2.0",
+    "MIT": "mit",
+    "OpenRAIL license family": "openrail",
+    "BigScience OpenRAIL-M": "bigscience-openrail-m",
+    "CreativeML OpenRAIL-M": "creativeml-openrail-m",
+    "BigScience BLOOM RAIL 1.0": "bigscience-bloom-rail-1.0",
+    "BigCode Open RAIL-M v1": "bigcode-openrail-m",
+    "Academic Free License v3.0": "afl-3.0",
+    "Artistic license 2.0": "artistic-2.0",
+    "Boost Software License 1.0": "bsl-1.0",
+    "BSD license family": "bsd",
+    "BSD 2-clause “Simplified” license": "bsd-2-clause",
+    "BSD 3-clause “New” or “Revised” license": "bsd-3-clause",
+    "BSD 3-clause Clear license": "bsd-3-clause-clear",
+    "Computational Use of Data Agreement": "c-uda",
+    "Creative Commons license family": "cc",
+    "Creative Commons Zero v1.0 Universal": "cc0-1.0",
+    "Creative Commons Attribution 2.0": "cc-by-2.0",
+    "Creative Commons Attribution 2.5": "cc-by-2.5",
+    "Creative Commons Attribution 3.0": "cc-by-3.0",
+    "Creative Commons Attribution 4.0": "cc-by-4.0",
+    "Creative Commons Attribution Share Alike 3.0": "cc-by-sa-3.0",
+    "Creative Commons Attribution Share Alike 4.0": "cc-by-sa-4.0",
+    "Creative Commons Attribution Non Commercial 2.0": "cc-by-nc-2.0",
+    "Creative Commons Attribution Non Commercial 3.0": "cc-by-nc-3.0",
+    "Creative Commons Attribution Non Commercial 4.0": "cc-by-nc-4.0",
+    "Creative Commons Attribution No Derivatives 4.0": "cc-by-nd-4.0",
+    "Creative Commons Attribution Non Commercial No Derivatives 3.0": "cc-by-nc-nd-3.0",
+    "Creative Commons Attribution Non Commercial No Derivatives 4.0": "cc-by-nc-nd-4.0",
+    "Creative Commons Attribution Non Commercial Share Alike 2.0": "cc-by-nc-sa-2.0",
+    "Creative Commons Attribution Non Commercial Share Alike 3.0": "cc-by-nc-sa-3.0",
+    "Creative Commons Attribution Non Commercial Share Alike 4.0": "cc-by-nc-sa-4.0",
+    "Community Data License Agreement – Sharing, Version 1.0": "cdla-sharing-1.0",
+    "Community Data License Agreement – Permissive, Version 1.0": "cdla-permissive-1.0",
+    "Community Data License Agreement – Permissive, Version 2.0": "cdla-permissive-2.0",
+    "Do What The F*ck You Want To Public License": "wtfpl",
+    "Educational Community License v2.0": "ecl-2.0",
+    "Eclipse Public License 1.0": "epl-1.0",
+    "Eclipse Public License 2.0": "epl-2.0",
+    "European Union Public License 1.1": "eupl-1.1",
+    "GNU Affero General Public License v3.0": "agpl-3.0",
+    "GNU Free Documentation License family": "gfdl",
+    "GNU General Public License family": "gpl",
+    "GNU General Public License v2.0": "gpl-2.0",
+    "GNU General Public License v3.0": "gpl-3.0",
+    "GNU Lesser General Public License family": "lgpl",
+    "GNU Lesser General Public License v2.1": "lgpl-2.1",
+    "GNU Lesser General Public License v3.0": "lgpl-3.0",
+    "ISC": "isc",
+    "LaTeX Project Public License v1.3c": "lppl-1.3c",
+    "Microsoft Public License": "ms-pl",
+    "Mozilla Public License 2.0": "mpl-2.0",
+    "Open Data Commons License Attribution family": "odc-by",
+    "Open Database License family": "odbl",
+    "Open Rail++-M License": "openrail++",
+    "Open Software License 3.0": "osl-3.0",
+    "PostgreSQL License": "postgresql",
+    "SIL Open Font License 1.1": "ofl-1.1",
+    "University of Illinois/NCSA Open Source License": "ncsa",
+    "The Unlicense": "unlicense",
+    "zLib License": "zlib",
+    "Open Data Commons Public Domain Dedication and License": "pddl",
+    "Lesser General Public License For Linguistic Resources": "lgpl-lr",
+    "DeepFloyd IF Research License Agreement": "deepfloyd-if-license",
+    "Llama 2 Community License Agreement": "llama2",
+}
+
+
+def find_license_index(code: str) -> int | None:
+    """Finds the index in the list of LICENSES."""
+    for index, license_code in enumerate(LICENSES.values()):
+        if license_code == code:
+            return index
+    return None
+
 
 class MetadataEvent(enum.Enum):
     """Event that triggers a metadata change."""
@@ -21,7 +101,7 @@ def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
    elif event == MetadataEvent.DESCRIPTION:
        metadata.description = st.session_state[key]
    elif event == MetadataEvent.LICENSE:
-        metadata.license = st.session_state[key]
+        metadata.license = LICENSES.get(st.session_state[key])
    elif event == MetadataEvent.CITATION:
        metadata.citation = st.session_state[key]
    elif event == MetadataEvent.URL:
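
The selectbox shows the human-readable names; the short Hugging Face codes are what gets persisted. find_license_index does the reverse lookup so a saved code can pre-select the right entry on re-render, which the new test file below exercises. A minimal sketch:

from events.metadata import LICENSES, find_license_index

assert LICENSES.get("MIT") == "mit"                        # name -> stored code
assert list(LICENSES)[find_license_index("mit")] == "MIT"  # code -> selectbox entry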
events/metadata_test.py ADDED
@@ -0,0 +1,7 @@
+from .metadata import find_license_index
+
+
+def test_find_license_index():
+    assert find_license_index("unknown") == 0
+    assert find_license_index("llama2") == 66
+    assert find_license_index("fooo") is None
events/resources.py CHANGED
@@ -4,6 +4,8 @@ import enum
 import streamlit as st
 
 from core.files import FILE_OBJECT
+from core.files import name_to_code
+from core.path import get_resource_path
 from core.state import FileObject
 from core.state import FileSet
 from core.state import Metadata
@@ -37,7 +39,7 @@ def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
    elif event == ResourceEvent.DESCRIPTION:
        resource.description = value
    elif event == ResourceEvent.ENCODING_FORMAT:
-        resource.encoding_format = value
+        resource.encoding_format = name_to_code(value)
    elif event == ResourceEvent.INCLUDES:
        resource.includes = value
    elif event == ResourceEvent.SHA256:
@@ -47,6 +49,11 @@ def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
    elif event == ResourceEvent.CONTENT_SIZE:
        resource.content_size = value
    elif event == ResourceEvent.CONTENT_URL:
+        if resource.content_url and value:
+            old_path = get_resource_path(resource.content_url)
+            new_path = get_resource_path(value)
+            if old_path.exists() and not new_path.exists():
+                old_path.rename(new_path)
        resource.content_url = value
    elif event == ResourceEvent.TYPE:
        metadata: Metadata = st.session_state[Metadata]
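
The new CONTENT_URL branch keeps the project folder in sync with the metadata: when a resource's content URL is edited, the file on disk is moved too. Roughly (illustrative names; epath.Path.rename mirrors pathlib, as the committed code itself relies on):

# Editing content_url from "data/old.csv" to "data/new.csv" does the
# equivalent of:
old_path = get_resource_path("data/old.csv")  # existing file
new_path = get_resource_path("data/new.csv")  # target, not yet present
if old_path.exists() and not new_path.exists():
    old_path.rename(new_path)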
utils.py CHANGED
@@ -41,14 +41,3 @@ def init_state(force=False):
 
     if OpenTab not in st.session_state or force:
         st.session_state[OpenTab] = None
-
-    # Uncomment those lines if you work locally in order to avoid clicks at each reload.
-    # And comment all previous lines in `init_state`.
-    # if mlc.Dataset not in st.session_state or force:
-    #     st.session_state[mlc.Dataset] = mlc.Dataset("../datasets/titanic/metadata.json")
-    # if Metadata not in st.session_state or force:
-    #     st.session_state[Metadata] = Metadata.from_canonical(
-    #         st.session_state[mlc.Dataset].metadata
-    #     )
-    # if CurrentProject not in st.session_state or force:
-    #     st.session_state[CurrentProject] = CurrentProject.create_new()
views/files.py CHANGED
@@ -1,7 +1,10 @@
+from etils import epath
 import streamlit as st
 
 from components.tree import render_tree
 from core.constants import DF_HEIGHT
+from core.constants import OAUTH_CLIENT_ID
+from core.files import code_to_index
 from core.files import file_from_form
 from core.files import file_from_upload
 from core.files import file_from_url
@@ -9,7 +12,9 @@ from core.files import FILE_OBJECT
 from core.files import FILE_SET
 from core.files import FILE_TYPES
 from core.files import RESOURCE_TYPES
+from core.path import get_resource_path
 from core.record_sets import infer_record_sets
+from core.state import CurrentProject
 from core.state import FileObject
 from core.state import FileSet
 from core.state import Metadata
@@ -23,10 +28,6 @@ Resource = FileObject | FileSet
 _DISTANT_URL_KEY = "import_from_url"
 _LOCAL_FILE_KEY = "import_from_local_file"
 _MANUAL_RESOURCE_TYPE_KEY = "create_manually_type"
-_MANUAL_NAME_KEY = "manual_object_name"
-_MANUAL_DESCRIPTION_KEY = "manual_object_description"
-_MANUAL_SHA256_KEY = "manual_object_sha256"
-_MANUAL_PARENT_KEY = "manual_object_parents"
 
 _INFO = """Resources can be `FileObjects` (single files) or `FileSets` (sets of files
 with the same MIME type). On this page, you can upload `FileObjects`, point to external
@@ -34,6 +35,8 @@ resources on the web or manually create new resources."""
 
 
 def render_files():
+    """Renders the views of the files: warnings and panels to display information."""
+    _render_warnings()
     col1, col2, col3 = st.columns([1, 1, 1], gap="small")
     with col1:
         st.markdown("##### Upload more resources")
@@ -47,6 +50,31 @@ def render_files():
     _render_right_panel()
 
 
+def _render_warnings():
+    """Renders warnings concerning local files."""
+    metadata: Metadata = st.session_state[Metadata]
+    warning = ""
+    for resource in metadata.distribution:
+        content_url = resource.content_url
+        if content_url and not content_url.startswith("http"):
+            path = get_resource_path(content_url)
+            if not path.exists():
+                if OAUTH_CLIENT_ID:
+                    warning += (
+                        f'⚠️ Resource "{resource.name}" points to a local file, but'
+                        " it doesn't exist on the disk. Fix this by changing the"
+                        " content URL.\n\n"
+                    )
+                else:
+                    warning += (
+                        f'⚠️ Resource "{resource.name}" points to a local file, but'
+                        " it doesn't exist on the disk. Fix this by either downloading"
+                        f" it to {path} or changing the content URL.\n\n"
+                    )
+    if warning:
+        st.warning(warning.strip())
+
+
 def _render_resources_panel(files: list[Resource]) -> Resource | None:
     """Renders the left panel: the list of all resources."""
     filename_to_file: dict[str, list[Resource]] = {}
@@ -99,13 +127,15 @@ def _render_upload_panel():
     file_type = FILE_TYPES[file_type_name]
     metadata: Metadata = st.session_state[Metadata]
     names = metadata.names()
+    project: CurrentProject = st.session_state[CurrentProject]
+    folder = project.path
     if url:
-        file = file_from_url(file_type, url, names)
+        file = file_from_url(file_type, url, names, folder)
     elif uploaded_file:
-        file = file_from_upload(file_type, uploaded_file, names)
+        file = file_from_upload(file_type, uploaded_file, names, folder)
     else:
         resource_type = st.session_state[_MANUAL_RESOURCE_TYPE_KEY]
-        file = file_from_form(resource_type, names)
+        file = file_from_form(resource_type, names, folder)
 
     st.session_state[Metadata].add_distribution(file)
     record_sets = infer_record_sets(file, names)
@@ -157,7 +187,7 @@ def _render_resource_details(selected_file: Resource):
     col1, col2 = st.columns([1, 1])
     col1.button("Close", key=f"{i}_close", on_click=close, type="primary")
     col2.button(
-        "Remove", key=f"{i}_remove", on_click=delete_line, type="secondary"
+        "⚠️ Remove", key=f"{i}_remove", on_click=delete_line, type="secondary"
     )
 
 
@@ -224,9 +254,10 @@ def _render_resource(prefix: int, file: FileObject | FileSet, is_file_object: bo
        args=(ResourceEvent.INCLUDES, file, key),
    )
    key = f"{prefix}_encoding"
-    st.text_input(
+    st.selectbox(
        needed_field("Encoding format"),
-        value=file.encoding_format,
+        index=code_to_index(file.encoding_format),
+        options=FILE_TYPES.keys(),
        key=key,
        on_change=handle_resource_change,
        args=(ResourceEvent.ENCODING_FORMAT, file, key),
views/metadata.py CHANGED
@@ -1,42 +1,35 @@
-import enum
-
 import streamlit as st
 
 from core.state import Metadata
+from events.metadata import find_license_index
 from events.metadata import handle_metadata_change
+from events.metadata import LICENSES
+from events.metadata import LICENSES_URL
 from events.metadata import MetadataEvent
 
-# List from https://www.kaggle.com/discussions/general/116302.
-licenses = [
-    "Other",
-    "Public Domain",
-    "Public",
-    "CC-0",
-    "PDDL",
-    "CC-BY",
-    "CDLA-Permissive-1.0",
-    "ODC-BY",
-    "CC-BY-SA",
-    "CDLA-Sharing-1.0",
-    "ODC-ODbL",
-    "CC BY-NC",
-    "CC BY-ND",
-    "CC BY-NC-SA",
-    "CC BY-NC-ND",
-]
-
 
 def render_metadata():
+    """Renders the `Metadata` view."""
     metadata = st.session_state[Metadata]
-    try:
-        index = licenses.index(metadata.license)
-    except ValueError:
-        index = None
+    index = find_license_index(metadata.license)
+    key = "metadata-url"
+    st.text_input(
+        label="URL",
+        key=key,
+        value=metadata.url,
+        placeholder="URL to the dataset.",
+        on_change=handle_metadata_change,
+        args=(MetadataEvent.URL, metadata, key),
+    )
     key = "metadata-license"
     st.selectbox(
         label="License",
+        help=(
+            "More information on license names and meaning can be found"
+            f" [here]({LICENSES_URL})."
+        ),
         key=key,
-        options=licenses,
+        options=LICENSES.keys(),
         index=index,
         on_change=handle_metadata_change,
         args=(MetadataEvent.LICENSE, metadata, key),
views/overview.py CHANGED
@@ -42,17 +42,6 @@ def render_overview():
     )
     if not name:
         st.stop()
-    key = "metadata-url"
-    url = st.text_input(
-        label=needed_field("URL"),
-        key=key,
-        value=metadata.url,
-        placeholder="URL to the dataset.",
-        on_change=handle_metadata_change,
-        args=(MetadataEvent.URL, metadata, key),
-    )
-    if not url:
-        st.stop()
     key = "metadata-description"
     st.text_area(
         label="Description",
views/record_sets.py CHANGED
@@ -1,6 +1,7 @@
 import multiprocessing
 import textwrap
 import time
+import traceback
 from typing import TypedDict
 
 import numpy as np
@@ -8,6 +9,10 @@ import pandas as pd
 from rdflib import term
 import streamlit as st
 
+from core.data_types import MLC_DATA_TYPES
+from core.data_types import mlc_to_str_data_type
+from core.data_types import STR_DATA_TYPES
+from core.data_types import str_to_mlc_data_type
 from core.query_params import expand_record_set
 from core.query_params import is_record_set_expanded
 from core.state import Field
@@ -23,14 +28,6 @@ from views.source import handle_field_change
 from views.source import render_references
 from views.source import render_source
 
-DATA_TYPES = [
-    mlc.DataType.TEXT,
-    mlc.DataType.FLOAT,
-    mlc.DataType.INTEGER,
-    mlc.DataType.BOOL,
-    mlc.DataType.URL,
-]
-
 _NUM_RECORDS = 3
 _TIMEOUT_SECONDS = 1
 
@@ -40,7 +37,16 @@ class _Result(TypedDict):
     exception: Exception | None
 
 
-@st.cache_data(show_spinner="Generating the dataset...")
+@st.cache_data(
+    show_spinner="Generating the dataset...",
+    hash_funcs={
+        "mlcroissant.Metadata": hash,
+        "mlcroissant.Field": hash,
+        "mlcroissant.FileObject": hash,
+        "mlcroissant.FileSet": hash,
+        "mlcroissant.RecordSet": hash,
+    },
+)
 def _generate_data_with_timeout(record_set: RecordSet) -> _Result:
     """Generates the data and waits at most _TIMEOUT_SECONDS."""
     with multiprocessing.Manager() as manager:
@@ -65,7 +71,7 @@ def _generate_data(record_set: RecordSet, result: _Result) -> pd.DataFrame | Non
     """Generates the first _NUM_RECORDS records."""
     try:
         metadata: Metadata = st.session_state[Metadata]
-        if not metadata:
+        if metadata is None:
             raise ValueError(
                 "The dataset is still incomplete. Please, go to the overview to see"
                 " errors."
@@ -87,8 +93,8 @@ def _generate_data(record_set: RecordSet, result: _Result) -> pd.DataFrame | Non
             pass
         df.append(record)
         result["df"] = pd.DataFrame(df)
-    except Exception as exception:
-        result["exception"] = exception
+    except Exception:
+        result["exception"] = traceback.format_exc()
 
 
 def _handle_close_fields():
@@ -154,6 +160,10 @@ def _handle_create_record_set():
     metadata.add_record_set(RecordSet(name="new-record-set", description=""))
 
 
+def _handle_remove_record_set(record_set_key: int):
+    del st.session_state[Metadata].record_sets[record_set_key]
+
+
 def _handle_fields_change(record_set_key: int, record_set: RecordSet):
     expand_record_set(record_set=record_set)
     data_editor_key = _data_editor_key(record_set_key, record_set)
@@ -172,12 +182,13 @@ def _handle_fields_change(record_set_key: int, record_set: RecordSet):
     elif new_field == FieldDataFrame.DESCRIPTION:
         field.description = new_value
     elif new_field == FieldDataFrame.DATA_TYPE:
-        field.data_types = [new_value]
+        field.data_types = [str_to_mlc_data_type(new_value)]
     for added_row in result["added_rows"]:
+        data_type = str_to_mlc_data_type(added_row.get(FieldDataFrame.DATA_TYPE))
         field = Field(
             name=added_row.get(FieldDataFrame.NAME),
             description=added_row.get(FieldDataFrame.DESCRIPTION),
-            data_types=[added_row.get(FieldDataFrame.DATA_TYPE)],
+            data_types=[data_type],
             source=mlc.Source(),
             references=mlc.Source(),
         )
@@ -296,7 +307,7 @@ def _render_left_panel():
     # TODO(https://github.com/mlcommons/croissant/issues/350): Allow to display
     # several data types, not only the first.
     data_types = [
-        field.data_types[0] if field.data_types else None
+        mlc_to_str_data_type(field.data_types[0]) if field.data_types else None
         for field in record_set.fields
     ]
     fields = pd.DataFrame(
@@ -331,7 +342,7 @@ def _render_left_panel():
         FieldDataFrame.DATA_TYPE: st.column_config.SelectboxColumn(
             FieldDataFrame.DATA_TYPE,
             help="The Croissant type",
-            options=DATA_TYPES,
+            options=STR_DATA_TYPES,
             required=True,
         ),
     },
@@ -365,6 +376,14 @@ def _render_left_panel():
        on_click=_handle_on_click_field,
        args=(record_set_key, record_set),
    )
+    key = f"{prefix}-delete-record-set"
+    st.button(
+        "⚠️ Delete RecordSet",
+        type="primary",
+        key=key,
+        on_click=_handle_remove_record_set,
+        args=(record_set_key,),
+    )
    st.button(
        "Create a new RecordSet",
        key=f"create-new-record-set",
@@ -432,8 +451,8 @@ def _render_right_panel():
         data_type = field.data_types[0]
         if isinstance(data_type, str):
             data_type = term.URIRef(data_type)
-        if data_type in DATA_TYPES:
-            data_type_index = DATA_TYPES.index(data_type)
+        if data_type in MLC_DATA_TYPES:
+            data_type_index = MLC_DATA_TYPES.index(data_type)
         else:
             data_type_index = None
     else:
@@ -442,7 +461,7 @@
     col3.selectbox(
         needed_field("Data type"),
         index=data_type_index,
-        options=DATA_TYPES,
+        options=STR_DATA_TYPES,
         key=key,
         on_change=handle_field_change,
         args=(FieldEvent.DATA_TYPE, field, key),
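
A note on the st.cache_data change in views/record_sets.py: Streamlit has to hash the cached function's arguments, and it presumably cannot hash mlcroissant node objects with its default mechanism, so the decorator now supplies hash_funcs keyed by fully-qualified type names. A self-contained sketch of the same pattern (hypothetical Node class standing in for an mlcroissant node):

import streamlit as st

class Node:
    """Stand-in for a node type with a meaningful __hash__."""
    def __init__(self, name: str):
        self.name = name
    def __hash__(self):
        return hash(self.name)

# Keys may be fully-qualified type names as strings; Streamlit uses the
# given callable (here the built-in hash) for arguments of those types.
@st.cache_data(hash_funcs={"__main__.Node": hash})
def expensive(node: Node) -> str:
    return node.name.upper()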