marcenacp committed on
Commit
8c11dd4
1 Parent(s): f374b33

Deploy (see actual commits on https://github.com/mlcommons/croissant).

Browse files
app.py CHANGED
@@ -7,7 +7,7 @@ from core.constants import OAUTH_STATE
7
  from core.constants import REDIRECT_URI
8
  from core.query_params import get_project_timestamp
9
  from core.state import CurrentProject
10
- from core.state import get_cached_user
11
  from core.state import User
12
  from utils import init_state
13
  from views.splash import render_splash
@@ -19,7 +19,7 @@ col1.header("Croissant Editor")
19
 
20
  init_state()
21
 
22
- user = get_cached_user()
23
 
24
  if OAUTH_CLIENT_ID and not user:
25
  query_params = st.experimental_get_query_params()
@@ -31,8 +31,7 @@ if OAUTH_CLIENT_ID and not user:
31
  try:
32
  st.session_state[User] = User.connect(code)
33
  # Clear the cache to force retrieving the new user.
34
- get_cached_user.clear()
35
- get_cached_user()
36
  except:
37
  raise
38
  finally:
@@ -56,7 +55,6 @@ def _back_to_menu():
56
  def _logout():
57
  """Logs the user out."""
58
  st.cache_data.clear()
59
- get_cached_user.clear()
60
  st.session_state[User] = None
61
  _back_to_menu()
62
 
 
7
  from core.constants import REDIRECT_URI
8
  from core.query_params import get_project_timestamp
9
  from core.state import CurrentProject
10
+ from core.state import get_user
11
  from core.state import User
12
  from utils import init_state
13
  from views.splash import render_splash
 
19
 
20
  init_state()
21
 
22
+ user = get_user()
23
 
24
  if OAUTH_CLIENT_ID and not user:
25
  query_params = st.experimental_get_query_params()
 
31
  try:
32
  st.session_state[User] = User.connect(code)
33
  # Clear the cache to force retrieving the new user.
34
+ get_user()
 
35
  except:
36
  raise
37
  finally:
 
55
  def _logout():
56
  """Logs the user out."""
57
  st.cache_data.clear()
 
58
  st.session_state[User] = None
59
  _back_to_menu()
60
 
core/constants.py CHANGED
@@ -35,8 +35,3 @@ METADATA = "Metadata"
35
  RESOURCES = "Resources"
36
  RECORD_SETS = "Record Sets"
37
  TABS = [OVERVIEW, METADATA, RESOURCES, RECORD_SETS]
38
-
39
- NAMES_INFO = (
40
- "Names are used as identifiers. They are unique and cannot contain special"
41
- " characters. The interface will replace any special characters."
42
- )
 
35
  RESOURCES = "Resources"
36
  RECORD_SETS = "Record Sets"
37
  TABS = [OVERVIEW, METADATA, RESOURCES, RECORD_SETS]
 
 
 
 
 
core/names.py CHANGED
@@ -1,13 +1,8 @@
1
  """Module to handle naming of RecordSets and distribution."""
2
 
3
- import re
4
-
5
- NAME_PATTERN_REGEX = "[^a-zA-Z0-9\\-_\\.]"
6
-
7
 
8
  def find_unique_name(names: set[str], name: str):
9
  """Find a unique UID."""
10
- name = re.sub(NAME_PATTERN_REGEX, "_", name)
11
  while name in names:
12
  name = f"{name}_0"
13
  return name
 
1
  """Module to handle naming of RecordSets and distribution."""
2
 
 
 
 
 
3
 
4
  def find_unique_name(names: set[str], name: str):
5
  """Find a unique UID."""
 
6
  while name in names:
7
  name = f"{name}_0"
8
  return name
core/names_test.py CHANGED
@@ -5,7 +5,6 @@ from .names import find_unique_name
5
 
6
  def test_find_unique_name():
7
  names = set(["first", "second", "first_0"])
8
- assert find_unique_name(names, "are there spaces") == "are_there_spaces"
9
  assert find_unique_name(names, "first") == "first_0_0"
10
  assert find_unique_name(names, "second") == "second_0"
11
  assert find_unique_name(names, "third") == "third"
 
5
 
6
  def test_find_unique_name():
7
  names = set(["first", "second", "first_0"])
 
8
  assert find_unique_name(names, "first") == "first_0_0"
9
  assert find_unique_name(names, "second") == "second_0"
10
  assert find_unique_name(names, "third") == "third"
core/past_projects.py CHANGED
@@ -8,12 +8,12 @@ from core.constants import PAST_PROJECTS_PATH
8
  from core.query_params import set_project
9
  from core.state import CurrentProject
10
  from core.state import FileObject
11
- from core.state import get_cached_user
12
  from core.state import Metadata
13
 
14
 
15
  def load_past_projects_paths() -> list[epath.Path]:
16
- user = get_cached_user()
17
  past_projects_path = PAST_PROJECTS_PATH(user)
18
  past_projects_path.mkdir(parents=True, exist_ok=True)
19
  return sorted(list(past_projects_path.iterdir()), reverse=True)
 
8
  from core.query_params import set_project
9
  from core.state import CurrentProject
10
  from core.state import FileObject
11
+ from core.state import get_user
12
  from core.state import Metadata
13
 
14
 
15
  def load_past_projects_paths() -> list[epath.Path]:
16
+ user = get_user()
17
  past_projects_path = PAST_PROJECTS_PATH(user)
18
  past_projects_path.mkdir(parents=True, exist_ok=True)
19
  return sorted(list(past_projects_path.iterdir()), reverse=True)
core/state.py CHANGED
@@ -83,9 +83,8 @@ class User:
83
  )
84
 
85
 
86
- @st.cache_data(ttl=datetime.timedelta(hours=1))
87
- def get_cached_user():
88
- """Caches user in session_state."""
89
  return st.session_state.get(User)
90
 
91
 
@@ -102,7 +101,7 @@ class CurrentProject:
102
 
103
  @classmethod
104
  def from_timestamp(cls, timestamp: str) -> CurrentProject | None:
105
- user = get_cached_user()
106
  if user is None and OAUTH_CLIENT_ID:
107
  return None
108
  else:
 
83
  )
84
 
85
 
86
+ def get_user():
87
+ """Get user from session_state."""
 
88
  return st.session_state.get(User)
89
 
90
 
 
101
 
102
  @classmethod
103
  def from_timestamp(cls, timestamp: str) -> CurrentProject | None:
104
+ user = get_user()
105
  if user is None and OAUTH_CLIENT_ID:
106
  return None
107
  else:
events/metadata.py CHANGED
@@ -2,7 +2,6 @@ import enum
2
 
3
  import streamlit as st
4
 
5
- from core.names import find_unique_name
6
  from core.state import Metadata
7
 
8
  # List from:
@@ -98,7 +97,7 @@ class MetadataEvent(enum.Enum):
98
 
99
  def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
100
  if event == MetadataEvent.NAME:
101
- metadata.name = find_unique_name(set(), st.session_state[key])
102
  elif event == MetadataEvent.DESCRIPTION:
103
  metadata.description = st.session_state[key]
104
  elif event == MetadataEvent.LICENSE:
 
2
 
3
  import streamlit as st
4
 
 
5
  from core.state import Metadata
6
 
7
  # List from:
 
97
 
98
  def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
99
  if event == MetadataEvent.NAME:
100
+ metadata.name = st.session_state[key]
101
  elif event == MetadataEvent.DESCRIPTION:
102
  metadata.description = st.session_state[key]
103
  elif event == MetadataEvent.LICENSE:
views/files.py CHANGED
@@ -3,7 +3,6 @@ import streamlit as st
3
  from components.safe_button import button_with_confirmation
4
  from components.tree import render_tree
5
  from core.constants import DF_HEIGHT
6
- from core.constants import NAMES_INFO
7
  from core.constants import OAUTH_CLIENT_ID
8
  from core.files import code_to_index
9
  from core.files import file_from_form
@@ -203,11 +202,6 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
203
  default=file.contained_in,
204
  options=parent_options,
205
  key=key,
206
- help=(
207
- "FileObjects and FileSets can be nested. Specifying `Parents` allows to"
208
- " nest a FileObject/FileSet within another FileObject/FileSet. An example"
209
- " of this is when images (FileSet) are nested within an archive (FileSet)."
210
- ),
211
  on_change=handle_resource_change,
212
  args=(ResourceEvent.CONTAINED_IN, file, key),
213
  )
@@ -216,7 +210,6 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
216
  needed_field("Name"),
217
  value=file.name,
218
  key=key,
219
- help=f"The name of the resource. {NAMES_INFO}",
220
  on_change=handle_resource_change,
221
  args=(ResourceEvent.NAME, file, key),
222
  )
@@ -232,10 +225,9 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
232
  if is_file_object:
233
  key = f"{prefix}_content_url"
234
  st.text_input(
235
- needed_field("Content URL or local path"),
236
  value=file.content_url,
237
  key=key,
238
- help="The URL or local file path pointing to the original FileObject.",
239
  on_change=handle_resource_change,
240
  args=(ResourceEvent.CONTENT_URL, file, key),
241
  )
@@ -252,7 +244,6 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
252
  "Content size",
253
  value=file.content_size,
254
  key=key,
255
- help="The size of the original FileObject in bytes.",
256
  on_change=handle_resource_change,
257
  args=(ResourceEvent.CONTENT_SIZE, file, key),
258
  )
@@ -271,10 +262,6 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
271
  index=code_to_index(file.encoding_format),
272
  options=FILE_TYPES.keys(),
273
  key=key,
274
- help=(
275
- "MIME type corresponding to"
276
- " ([sc:encodingFormat](https://schema.org/encodingFormat))."
277
- ),
278
  on_change=handle_resource_change,
279
  args=(ResourceEvent.ENCODING_FORMAT, file, key),
280
  )
 
3
  from components.safe_button import button_with_confirmation
4
  from components.tree import render_tree
5
  from core.constants import DF_HEIGHT
 
6
  from core.constants import OAUTH_CLIENT_ID
7
  from core.files import code_to_index
8
  from core.files import file_from_form
 
202
  default=file.contained_in,
203
  options=parent_options,
204
  key=key,
 
 
 
 
 
205
  on_change=handle_resource_change,
206
  args=(ResourceEvent.CONTAINED_IN, file, key),
207
  )
 
210
  needed_field("Name"),
211
  value=file.name,
212
  key=key,
 
213
  on_change=handle_resource_change,
214
  args=(ResourceEvent.NAME, file, key),
215
  )
 
225
  if is_file_object:
226
  key = f"{prefix}_content_url"
227
  st.text_input(
228
+ needed_field("Content URL"),
229
  value=file.content_url,
230
  key=key,
 
231
  on_change=handle_resource_change,
232
  args=(ResourceEvent.CONTENT_URL, file, key),
233
  )
 
244
  "Content size",
245
  value=file.content_size,
246
  key=key,
 
247
  on_change=handle_resource_change,
248
  args=(ResourceEvent.CONTENT_SIZE, file, key),
249
  )
 
262
  index=code_to_index(file.encoding_format),
263
  options=FILE_TYPES.keys(),
264
  key=key,
 
 
 
 
265
  on_change=handle_resource_change,
266
  args=(ResourceEvent.ENCODING_FORMAT, file, key),
267
  )
views/overview.py CHANGED
@@ -3,7 +3,6 @@ from typing import Any
3
 
4
  import streamlit as st
5
 
6
- from core.constants import NAMES_INFO
7
  from core.state import Metadata
8
  import mlcroissant as mlc
9
  from utils import needed_field
@@ -52,7 +51,6 @@ def render_overview():
52
  label=needed_field("Name"),
53
  key=key,
54
  value=metadata.name,
55
- help=f"The name of the dataset. {NAMES_INFO}",
56
  placeholder="Dataset",
57
  on_change=handle_metadata_change,
58
  args=(MetadataEvent.NAME, metadata, key),
@@ -84,14 +82,7 @@ def render_overview():
84
  * 100
85
  / (3 * metadata_weight)
86
  )
87
- col_a.metric(
88
- "Completion",
89
- f"{completion}%",
90
- help=(
91
- "Approximation of the total completion based on the number of fields"
92
- " that are filled."
93
- ),
94
- )
95
  col_b.metric("Number of metadata fields", fields)
96
  col_c.metric("Number of resources", len(metadata.distribution))
97
  col_d.metric("Number of RecordSets", len(metadata.record_sets))
 
3
 
4
  import streamlit as st
5
 
 
6
  from core.state import Metadata
7
  import mlcroissant as mlc
8
  from utils import needed_field
 
51
  label=needed_field("Name"),
52
  key=key,
53
  value=metadata.name,
 
54
  placeholder="Dataset",
55
  on_change=handle_metadata_change,
56
  args=(MetadataEvent.NAME, metadata, key),
 
82
  * 100
83
  / (3 * metadata_weight)
84
  )
85
+ col_a.metric("Completion", f"{completion}%")
 
 
 
 
 
 
 
86
  col_b.metric("Number of metadata fields", fields)
87
  col_c.metric("Number of resources", len(metadata.distribution))
88
  col_d.metric("Number of RecordSets", len(metadata.record_sets))
views/record_sets.py CHANGED
@@ -10,7 +10,6 @@ from rdflib import term
10
  import streamlit as st
11
 
12
  from components.safe_button import button_with_confirmation
13
- from core.constants import NAMES_INFO
14
  from core.data_types import MLC_DATA_TYPES
15
  from core.data_types import mlc_to_str_data_type
16
  from core.data_types import STR_DATA_TYPES
@@ -241,7 +240,6 @@ def _render_left_panel():
241
  needed_field("Name"),
242
  placeholder="Name without special character.",
243
  key=key,
244
- help=f"The name of the RecordSet. {NAMES_INFO}",
245
  value=record_set.name,
246
  on_change=handle_record_set_change,
247
  args=(RecordSetEvent.NAME, record_set, key),
@@ -259,13 +257,6 @@ def _render_left_panel():
259
  st.checkbox(
260
  "The RecordSet is an enumeration",
261
  key=key,
262
- help=(
263
- "Enumerations indicate that the RecordSet takes its values from a"
264
- " finite set. Similar to `ClassLabel` in"
265
- " [TFDS](https://www.tensorflow.org/datasets/api_docs/python/tfds/features/ClassLabel)"
266
- " or [Hugging"
267
- " Face](https://huggingface.co/docs/datasets/v2.15.0/en/package_reference/main_classes#datasets.ClassLabel)."
268
- ),
269
  value=record_set.is_enumeration,
270
  on_change=handle_record_set_change,
271
  args=(RecordSetEvent.IS_ENUMERATION, record_set, key),
@@ -274,10 +265,6 @@ def _render_left_panel():
274
  st.checkbox(
275
  "The RecordSet has in-line data",
276
  key=key,
277
- help=(
278
- "In-line data allows to embed data directly within the JSON-LD"
279
- " without referencing another data source."
280
- ),
281
  value=bool(record_set.data),
282
  on_change=handle_record_set_change,
283
  args=(RecordSetEvent.HAS_DATA, record_set, key),
@@ -337,14 +324,8 @@ def _render_left_panel():
337
  )
338
  data_editor_key = _data_editor_key(record_set_key, record_set)
339
  st.markdown(
340
- needed_field("Fields"),
341
- help=(
342
- "Add/delete fields by directly editing the table. **Warning**: the"
343
- " table contains information about the fields--not the data"
344
- " directly. If you wish to embed data, tick the `The RecordSet is"
345
- " an enumeration` box. To edit fields details, click the"
346
- " button `Edit fields details` below."
347
- ),
348
  )
349
  st.data_editor(
350
  fields,
@@ -456,7 +437,6 @@ def _render_right_panel():
456
  needed_field("Name"),
457
  placeholder="Name without special character.",
458
  key=key,
459
- help=f"The name of the field. {NAMES_INFO}",
460
  value=field.name,
461
  on_change=handle_field_change,
462
  args=(FieldEvent.NAME, field, key),
@@ -470,29 +450,32 @@ def _render_right_panel():
470
  value=field.description,
471
  args=(FieldEvent.DESCRIPTION, field, key),
472
  )
473
- data_type_index = None
474
  if field.data_types:
475
  data_type = field.data_types[0]
476
  if isinstance(data_type, str):
477
  data_type = term.URIRef(data_type)
478
  if data_type in MLC_DATA_TYPES:
479
  data_type_index = MLC_DATA_TYPES.index(data_type)
 
 
 
 
480
  key = f"{prefix}-datatypes"
481
  col3.selectbox(
482
  needed_field("Data type"),
483
  index=data_type_index,
484
  options=STR_DATA_TYPES,
485
  key=key,
486
- help=(
487
- "The type of the data. `Text` corresponds to"
488
- " https://schema.org/Text, etc."
489
- ),
490
  on_change=handle_field_change,
491
  args=(FieldEvent.DATA_TYPE, field, key),
492
  )
493
  possible_sources = _get_possible_sources(metadata)
494
- render_source(record_set, field, possible_sources)
495
- render_references(record_set, field, possible_sources)
 
 
 
 
496
 
497
  st.divider()
498
 
 
10
  import streamlit as st
11
 
12
  from components.safe_button import button_with_confirmation
 
13
  from core.data_types import MLC_DATA_TYPES
14
  from core.data_types import mlc_to_str_data_type
15
  from core.data_types import STR_DATA_TYPES
 
240
  needed_field("Name"),
241
  placeholder="Name without special character.",
242
  key=key,
 
243
  value=record_set.name,
244
  on_change=handle_record_set_change,
245
  args=(RecordSetEvent.NAME, record_set, key),
 
257
  st.checkbox(
258
  "The RecordSet is an enumeration",
259
  key=key,
 
 
 
 
 
 
 
260
  value=record_set.is_enumeration,
261
  on_change=handle_record_set_change,
262
  args=(RecordSetEvent.IS_ENUMERATION, record_set, key),
 
265
  st.checkbox(
266
  "The RecordSet has in-line data",
267
  key=key,
 
 
 
 
268
  value=bool(record_set.data),
269
  on_change=handle_record_set_change,
270
  args=(RecordSetEvent.HAS_DATA, record_set, key),
 
324
  )
325
  data_editor_key = _data_editor_key(record_set_key, record_set)
326
  st.markdown(
327
+ f"{needed_field('Fields')} (add/delete fields by directly editing the"
328
+ " table)"
 
 
 
 
 
 
329
  )
330
  st.data_editor(
331
  fields,
 
437
  needed_field("Name"),
438
  placeholder="Name without special character.",
439
  key=key,
 
440
  value=field.name,
441
  on_change=handle_field_change,
442
  args=(FieldEvent.NAME, field, key),
 
450
  value=field.description,
451
  args=(FieldEvent.DESCRIPTION, field, key),
452
  )
 
453
  if field.data_types:
454
  data_type = field.data_types[0]
455
  if isinstance(data_type, str):
456
  data_type = term.URIRef(data_type)
457
  if data_type in MLC_DATA_TYPES:
458
  data_type_index = MLC_DATA_TYPES.index(data_type)
459
+ else:
460
+ data_type_index = None
461
+ else:
462
+ data_type_index = None
463
  key = f"{prefix}-datatypes"
464
  col3.selectbox(
465
  needed_field("Data type"),
466
  index=data_type_index,
467
  options=STR_DATA_TYPES,
468
  key=key,
 
 
 
 
469
  on_change=handle_field_change,
470
  args=(FieldEvent.DATA_TYPE, field, key),
471
  )
472
  possible_sources = _get_possible_sources(metadata)
473
+ render_source(
474
+ record_set_key, record_set, field, field_key, possible_sources
475
+ )
476
+ render_references(
477
+ record_set_key, record_set, field, field_key, possible_sources
478
+ )
479
 
480
  st.divider()
481
 
views/source.py CHANGED
@@ -12,15 +12,6 @@ from events.fields import TransformType
12
  import mlcroissant as mlc
13
  from utils import needed_field
14
 
15
- _JSON_PATH_DOCUMENTATION = (
16
- "The JSON path if the data source is a JSON (see"
17
- " [documentation](https://www.ietf.org/archive/id/draft-goessner-dispatch-jsonpath-00.html))."
18
- )
19
- _EXTRACT_DOCUMENTATION = (
20
- "The extraction method to get the value of the field (column in a CSV, etc)."
21
- )
22
- _COLUMN_NAME_DOCUMENTATION = "The name of the column if the data source is a CSV."
23
-
24
 
25
  class SourceType:
26
  """The type of the source (distribution or field)."""
@@ -114,8 +105,10 @@ def _handle_remove_reference(field):
114
 
115
 
116
  def render_source(
 
117
  record_set: RecordSet,
118
  field: Field,
 
119
  possible_sources: list[str],
120
  ):
121
  """Renders the form for the source."""
@@ -130,13 +123,10 @@ def render_source(
130
  index = None
131
  key = f"{prefix}-source"
132
  col1.selectbox(
133
- needed_field("Data source"),
134
  index=index,
135
  options=options,
136
  key=key,
137
- help=(
138
- "Data sources can be other resources (FileObject, FileSet) or other fields."
139
- ),
140
  on_change=handle_field_change,
141
  args=(FieldEvent.SOURCE, field, key),
142
  )
@@ -145,7 +135,6 @@ def render_source(
145
  needed_field("Extract"),
146
  index=_get_extract_index(source),
147
  key=f"{prefix}-extract",
148
- help=_EXTRACT_DOCUMENTATION,
149
  options=EXTRACT_TYPES,
150
  on_change=handle_field_change,
151
  args=(FieldEvent.SOURCE_EXTRACT, field, key),
@@ -156,7 +145,6 @@ def render_source(
156
  needed_field("Column name"),
157
  value=source.extract.column,
158
  key=key,
159
- help=_COLUMN_NAME_DOCUMENTATION,
160
  on_change=handle_field_change,
161
  args=(FieldEvent.SOURCE_EXTRACT_COLUMN, field, key),
162
  )
@@ -166,7 +154,6 @@ def render_source(
166
  needed_field("JSON path"),
167
  value=source.extract.json_path,
168
  key=key,
169
- help=_JSON_PATH_DOCUMENTATION,
170
  on_change=handle_field_change,
171
  args=(FieldEvent.SOURCE_EXTRACT_JSON_PATH, field, key),
172
  )
@@ -183,23 +170,18 @@ def render_source(
183
  key=key,
184
  options=TRANSFORM_TYPES,
185
  on_change=handle_field_change,
186
- help="One or more transformations to apply after extracting the field.",
187
  args=(FieldEvent.TRANSFORM, field, key),
188
  kwargs={"number": number},
189
  )
190
  if selected == TransformType.FORMAT:
191
  key = f"{prefix}-{number}-transform-format"
192
  col3.text_input(
193
- needed_field("Format a date"),
194
  value=transform.format,
195
  key=key,
196
  on_change=handle_field_change,
197
- help=(
198
- "For dates, use [`Python format"
199
- " codes`](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)."
200
- ),
201
  args=(selected, field, key),
202
- kwargs={"number": number},
203
  )
204
  elif selected == TransformType.JSON_PATH:
205
  key = f"{prefix}-{number}-jsonpath"
@@ -208,9 +190,8 @@ def render_source(
208
  value=transform.json_path,
209
  key=key,
210
  on_change=handle_field_change,
211
- help=_JSON_PATH_DOCUMENTATION,
212
  args=(selected, field, key),
213
- kwargs={"number": number},
214
  )
215
  elif selected == TransformType.REGEX:
216
  key = f"{prefix}-{number}-regex"
@@ -219,14 +200,8 @@ def render_source(
219
  value=transform.regex,
220
  key=key,
221
  on_change=handle_field_change,
222
- help=(
223
- "A regular expression following [`re` Python"
224
- " convention](https://docs.python.org/3/library/re.html#regular-expression-syntax)"
225
- " with one capturing group. The result of the operation will be"
226
- " the last captured group."
227
- ),
228
  args=(selected, field, key),
229
- kwargs={"number": number},
230
  )
231
  elif selected == TransformType.REPLACE:
232
  key = f"{prefix}-{number}-replace"
@@ -235,13 +210,8 @@ def render_source(
235
  value=transform.replace,
236
  key=key,
237
  on_change=handle_field_change,
238
- help=(
239
- "A replace pattern separated by a `/`, i.e."
240
- " `string_to_replace/string_to_substitute` in order to replace"
241
- " `string_to_replace` by `string_to_substitute`."
242
- ),
243
  args=(selected, field, key),
244
- kwargs={"number": number},
245
  )
246
  elif selected == TransformType.SEPARATOR:
247
  key = f"{prefix}-{number}-separator"
@@ -250,9 +220,8 @@ def render_source(
250
  value=transform.separator,
251
  key=key,
252
  on_change=handle_field_change,
253
- help="A separator to split strings on, e.g. `|` to split `a|b|c`.",
254
  args=(selected, field, key),
255
- kwargs={"number": number},
256
  )
257
 
258
  def _handle_remove_transform(field, number):
@@ -261,7 +230,6 @@ def render_source(
261
  col4.button(
262
  "✖️",
263
  key=f"{prefix}-{number}-remove-transform",
264
- help="Remove the transformation.",
265
  on_click=_handle_remove_transform,
266
  args=(field, number),
267
  )
@@ -275,15 +243,16 @@ def render_source(
275
  col1.button(
276
  "Add transform on data",
277
  key=f"{prefix}-close-fields",
278
- help="Add a transformation.",
279
  on_click=_handle_add_transform,
280
  args=(field,),
281
  )
282
 
283
 
284
  def render_references(
 
285
  record_set: RecordSet,
286
  field: Field,
 
287
  possible_sources: list[str],
288
  ):
289
  """Renders the form for references."""
@@ -317,7 +286,6 @@ def render_references(
317
  index=_get_extract_index(references),
318
  key=key,
319
  options=EXTRACT_TYPES,
320
- help=_EXTRACT_DOCUMENTATION,
321
  on_change=handle_field_change,
322
  args=(FieldEvent.REFERENCE_EXTRACT, field, key),
323
  )
@@ -327,7 +295,6 @@ def render_references(
327
  needed_field("Column name"),
328
  value=references.extract.column,
329
  key=key,
330
- help=_COLUMN_NAME_DOCUMENTATION,
331
  on_change=handle_field_change,
332
  args=(FieldEvent.REFERENCE_EXTRACT_COLUMN, field, key),
333
  )
@@ -337,14 +304,12 @@ def render_references(
337
  needed_field("JSON path"),
338
  value=references.extract.json_path,
339
  key=key,
340
- help=_JSON_PATH_DOCUMENTATION,
341
  on_change=handle_field_change,
342
  args=(FieldEvent.REFERENCE_EXTRACT_JSON_PATH, field, key),
343
  )
344
  col4.button(
345
  "✖️",
346
  key=f"{key}-remove-reference",
347
- help="Remove the join.",
348
  on_click=_handle_remove_reference,
349
  args=(field,),
350
  )
 
12
  import mlcroissant as mlc
13
  from utils import needed_field
14
 
 
 
 
 
 
 
 
 
 
15
 
16
  class SourceType:
17
  """The type of the source (distribution or field)."""
 
105
 
106
 
107
  def render_source(
108
+ record_set_key: int,
109
  record_set: RecordSet,
110
  field: Field,
111
+ field_key: int,
112
  possible_sources: list[str],
113
  ):
114
  """Renders the form for the source."""
 
123
  index = None
124
  key = f"{prefix}-source"
125
  col1.selectbox(
126
+ needed_field("Source"),
127
  index=index,
128
  options=options,
129
  key=key,
 
 
 
130
  on_change=handle_field_change,
131
  args=(FieldEvent.SOURCE, field, key),
132
  )
 
135
  needed_field("Extract"),
136
  index=_get_extract_index(source),
137
  key=f"{prefix}-extract",
 
138
  options=EXTRACT_TYPES,
139
  on_change=handle_field_change,
140
  args=(FieldEvent.SOURCE_EXTRACT, field, key),
 
145
  needed_field("Column name"),
146
  value=source.extract.column,
147
  key=key,
 
148
  on_change=handle_field_change,
149
  args=(FieldEvent.SOURCE_EXTRACT_COLUMN, field, key),
150
  )
 
154
  needed_field("JSON path"),
155
  value=source.extract.json_path,
156
  key=key,
 
157
  on_change=handle_field_change,
158
  args=(FieldEvent.SOURCE_EXTRACT_JSON_PATH, field, key),
159
  )
 
170
  key=key,
171
  options=TRANSFORM_TYPES,
172
  on_change=handle_field_change,
 
173
  args=(FieldEvent.TRANSFORM, field, key),
174
  kwargs={"number": number},
175
  )
176
  if selected == TransformType.FORMAT:
177
  key = f"{prefix}-{number}-transform-format"
178
  col3.text_input(
179
+ needed_field("Format"),
180
  value=transform.format,
181
  key=key,
182
  on_change=handle_field_change,
 
 
 
 
183
  args=(selected, field, key),
184
+ kwargs={"number": number, "type": "format"},
185
  )
186
  elif selected == TransformType.JSON_PATH:
187
  key = f"{prefix}-{number}-jsonpath"
 
190
  value=transform.json_path,
191
  key=key,
192
  on_change=handle_field_change,
 
193
  args=(selected, field, key),
194
+ kwargs={"number": number, "type": "format"},
195
  )
196
  elif selected == TransformType.REGEX:
197
  key = f"{prefix}-{number}-regex"
 
200
  value=transform.regex,
201
  key=key,
202
  on_change=handle_field_change,
 
 
 
 
 
 
203
  args=(selected, field, key),
204
+ kwargs={"number": number, "type": "format"},
205
  )
206
  elif selected == TransformType.REPLACE:
207
  key = f"{prefix}-{number}-replace"
 
210
  value=transform.replace,
211
  key=key,
212
  on_change=handle_field_change,
 
 
 
 
 
213
  args=(selected, field, key),
214
+ kwargs={"number": number, "type": "format"},
215
  )
216
  elif selected == TransformType.SEPARATOR:
217
  key = f"{prefix}-{number}-separator"
 
220
  value=transform.separator,
221
  key=key,
222
  on_change=handle_field_change,
 
223
  args=(selected, field, key),
224
+ kwargs={"number": number, "type": "format"},
225
  )
226
 
227
  def _handle_remove_transform(field, number):
 
230
  col4.button(
231
  "✖️",
232
  key=f"{prefix}-{number}-remove-transform",
 
233
  on_click=_handle_remove_transform,
234
  args=(field, number),
235
  )
 
243
  col1.button(
244
  "Add transform on data",
245
  key=f"{prefix}-close-fields",
 
246
  on_click=_handle_add_transform,
247
  args=(field,),
248
  )
249
 
250
 
251
  def render_references(
252
+ record_set_key: int,
253
  record_set: RecordSet,
254
  field: Field,
255
+ field_key: int,
256
  possible_sources: list[str],
257
  ):
258
  """Renders the form for references."""
 
286
  index=_get_extract_index(references),
287
  key=key,
288
  options=EXTRACT_TYPES,
 
289
  on_change=handle_field_change,
290
  args=(FieldEvent.REFERENCE_EXTRACT, field, key),
291
  )
 
295
  needed_field("Column name"),
296
  value=references.extract.column,
297
  key=key,
 
298
  on_change=handle_field_change,
299
  args=(FieldEvent.REFERENCE_EXTRACT_COLUMN, field, key),
300
  )
 
304
  needed_field("JSON path"),
305
  value=references.extract.json_path,
306
  key=key,
 
307
  on_change=handle_field_change,
308
  args=(FieldEvent.REFERENCE_EXTRACT_JSON_PATH, field, key),
309
  )
310
  col4.button(
311
  "✖️",
312
  key=f"{key}-remove-reference",
 
313
  on_click=_handle_remove_reference,
314
  args=(field,),
315
  )
views/splash.py CHANGED
@@ -13,8 +13,6 @@ import mlcroissant as mlc
13
  from views.load import render_load
14
  from views.previous_files import render_previous_files
15
 
16
- _HUGGING_FACE_URL = "https://huggingface.co/datasets/"
17
-
18
  _DATASETS = {
19
  "Titanic": ["data/embarkation_ports.csv", "data/genders.csv"],
20
  "FLORES-200": [],
@@ -36,7 +34,9 @@ def render_splash():
36
  )
37
  col1, col2 = st.columns([1, 1], gap="large")
38
  with col1:
39
- with st.expander("**Create from scratch!**", expanded=True):
 
 
40
 
41
  def create_new_croissant():
42
  st.session_state[Metadata] = Metadata()
@@ -81,37 +81,6 @@ def render_splash():
81
  type="primary",
82
  args=(dataset,),
83
  )
84
- with st.expander("**Load a dataset from Hugging Face!**", expanded=True):
85
- url = st.text_input(
86
- label="Hugging Face URL",
87
- placeholder="https://huggingface.co/datasets/mnist",
88
- )
89
- if url.startswith(_HUGGING_FACE_URL):
90
- url = url.replace(_HUGGING_FACE_URL, "")
91
-
92
- def download_huggingface_json(name: str):
93
- api_url = f"https://datasets-server.huggingface.co/croissant?dataset={name}"
94
- json = requests.get(api_url, headers=None).json()
95
- try:
96
- metadata = mlc.Metadata.from_json(mlc.Issues(), json, None)
97
- st.session_state[Metadata] = Metadata.from_canonical(metadata)
98
- save_current_project()
99
- except Exception:
100
- st.error(f"Malformed JSON: {json}")
101
-
102
- st.button(
103
- f'Download "{url}"',
104
- on_click=download_huggingface_json,
105
- type="primary",
106
- args=(url,),
107
- )
108
- elif url:
109
- st.error(
110
- f"Unknown URL {url}. Hugging Face URLS should look like"
111
- f" {_HUGGING_FACE_URL}somedataset."
112
- )
113
- with st.expander("**Load an existing Croissant JSON-LD file**", expanded=True):
114
- render_load()
115
  with col2:
116
  with st.expander("**Past projects**", expanded=True):
117
  render_previous_files()
 
13
  from views.load import render_load
14
  from views.previous_files import render_previous_files
15
 
 
 
16
  _DATASETS = {
17
  "Titanic": ["data/embarkation_ports.csv", "data/genders.csv"],
18
  "FLORES-200": [],
 
34
  )
35
  col1, col2 = st.columns([1, 1], gap="large")
36
  with col1:
37
+ with st.expander("**Load an existing Croissant JSON-LD file**", expanded=True):
38
+ render_load()
39
+ with st.expander("**Create from scratch**", expanded=True):
40
 
41
  def create_new_croissant():
42
  st.session_state[Metadata] = Metadata()
 
81
  type="primary",
82
  args=(dataset,),
83
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  with col2:
85
  with st.expander("**Past projects**", expanded=True):
86
  render_previous_files()