Initial commit
This view is limited to 50 files because it contains too many changes.
- .streamlit/config.toml +9 -0
- Makefile +26 -0
- __init__.py +0 -0
- app.py +29 -0
- components/__init__.py +0 -0
- components/tree/__init__.py +36 -0
- components/tree/frontend/.env +6 -0
- components/tree/frontend/.prettierrc +5 -0
- components/tree/frontend/build/asset-manifest.json +10 -0
- components/tree/frontend/build/index.html +1 -0
- components/tree/frontend/build/static/js/main.5a572f5d.js +0 -0
- components/tree/frontend/build/static/js/main.5a572f5d.js.LICENSE.txt +63 -0
- components/tree/frontend/build/static/js/main.5a572f5d.js.map +0 -0
- components/tree/frontend/package-lock.json +0 -0
- components/tree/frontend/package.json +47 -0
- components/tree/frontend/public/index.html +27 -0
- components/tree/frontend/src/Tree.tsx +215 -0
- components/tree/frontend/src/index.tsx +10 -0
- components/tree/frontend/src/react-app-env.d.ts +1 -0
- components/tree/frontend/tsconfig.json +25 -0
- core/__init__.py +0 -0
- core/constants.py +10 -0
- core/data_types.py +19 -0
- core/data_types_test.py +15 -0
- core/files.py +154 -0
- core/files_test.py +27 -0
- core/names.py +8 -0
- core/names_test.py +10 -0
- core/past_projects.py +34 -0
- core/record_sets.py +38 -0
- core/state.py +261 -0
- cypress.config.js +7 -0
- cypress/downloads/croissant-Titanic.json +1 -0
- cypress/downloads/croissant.json +1 -0
- cypress/e2e/createManually.cy.js +35 -0
- cypress/e2e/displayErrors.cy.js +30 -0
- cypress/e2e/loadCroissant.cy.js +61 -0
- cypress/e2e/renameDistribution.cy.js +36 -0
- cypress/e2e/uploadCsv.cy.js +59 -0
- cypress/fixtures/base.csv +4 -0
- cypress/fixtures/coco.json +409 -0
- cypress/fixtures/titanic.json +343 -0
- cypress/screenshots/uploadCsv.cy.js/Editor loads a local CSV as a resource -- should display the form Overview, Metadata, Resources, & Record Sets (failed).png +0 -0
- cypress/support/e2e.js +6 -0
- cypress/support/resize_observer.js +11 -0
- events/__init__.py +0 -0
- events/fields.py +147 -0
- events/metadata.py +28 -0
- events/record_sets.py +29 -0
- events/resources.py +41 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,9 @@
+[browser]
+gatherUsageStats = false
+
+[theme]
+primaryColor = "#F29828"
+backgroundColor = "#CCEBD4"
+secondaryBackgroundColor = "#EEF2F9"
+textColor = "#171D30"
+font = "sans serif"
Makefile
ADDED
@@ -0,0 +1,26 @@
+black:
+	black \
+		--line-length 88 \
+		--preview \
+		.
+
+isort:
+	isort \
+		--profile google \
+		--line-length 88 \
+		--use-parentheses \
+		--project mlcroissant \
+		--project components \
+		--project core \
+		--project events \
+		--project views \
+		--project state \
+		--project utils \
+		--multi-line 3 \
+		--thirdparty datasets \
+		.
+
+format: black isort
+
+pytest:
+	PYTHONPATH=. pytest
__init__.py
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,29 @@
+import streamlit as st
+
+from core.state import CurrentStep
+from utils import init_state
+from views.splash import render_splash
+from views.wizard import render_editor
+
+init_state()
+
+
+def _back_to_menu():
+    """Sends the user back to the menu."""
+    init_state(force=True)
+
+
+st.set_page_config(page_title="Croissant Editor", page_icon="🥐", layout="wide")
+col1, col2 = st.columns([10, 1])
+col1.header("Croissant Editor")
+if st.session_state[CurrentStep] != CurrentStep.splash:
+    col2.write("\n")  # Vertical box to shift the button menu
+    col2.button("Menu", on_click=_back_to_menu)
+
+
+if st.session_state[CurrentStep] == CurrentStep.splash:
+    render_splash()
+elif st.session_state[CurrentStep] == CurrentStep.editor:
+    render_editor()
+else:
+    st.warning("invalid unhandled app state")
components/__init__.py
ADDED
File without changes
components/tree/__init__.py
ADDED
@@ -0,0 +1,36 @@
+import os
+
+import streamlit.components.v1 as components
+
+# Create a _RELEASE constant. We'll set this to False while we're developing
+# the component, and True when we're ready to package and distribute it.
+_RELEASE = True
+
+if not _RELEASE:
+    _component_func = components.declare_component(
+        "tree_component",
+        url="http://localhost:3001",
+    )
+else:
+    parent_dir = os.path.dirname(os.path.abspath(__file__))
+    build_dir = os.path.join(parent_dir, "frontend/build")
+    _component_func = components.declare_component("tree_component", path=build_dir)
+
+
+def render_tree(nodes, key=None):
+    """Create a new instance of "tree_component".
+
+    Args:
+        nodes: The nodes to render in the tree. Nodes are dictionaries with keys
+            `name` (unique identifier), `type` and `parents` (a list of other
+            node names), matching the `Node` type in the frontend's `Tree.tsx`.
+        key: An optional key that uniquely identifies this component. If this is
+            None, and the component's arguments are changed, the component will
+            be re-mounted in the Streamlit frontend and lose its current state.
+
+    Returns:
+        The name of the last clicked node. (This is the value passed to
+        `Streamlit.setComponentValue` on the frontend.)
+    """
+    component_value = _component_func(nodes=nodes, key=key, default=0)
+    return component_value
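A minimal, hypothetical usage sketch for `render_tree` from a Streamlit page (not part of this commit; the node names are made up, but the node shape follows the `Node` type in `Tree.tsx` below):

    import streamlit as st

    from components.tree import render_tree

    # Hypothetical nodes: `parents` references other node names.
    nodes = [
        {"name": "train", "type": "FileSet", "parents": []},
        {"name": "train/data.csv", "type": "FileObject", "parents": ["train"]},
    ]
    # Returns the name of the node the user last clicked.
    selected = render_tree(nodes, key="resource_tree")
    st.write(f"Selected resource: {selected}")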
components/tree/frontend/.env
ADDED
@@ -0,0 +1,6 @@
+# Run the component's dev server on :3001
+# (The Streamlit dev server already runs on :3000)
+PORT=3001
+
+# Don't automatically open the web browser on `npm run start`.
+BROWSER=none
components/tree/frontend/.prettierrc
ADDED
@@ -0,0 +1,5 @@
+{
+  "endOfLine": "lf",
+  "semi": false,
+  "trailingComma": "es5"
+}
components/tree/frontend/build/asset-manifest.json
ADDED
@@ -0,0 +1,10 @@
+{
+  "files": {
+    "main.js": "./static/js/main.5a572f5d.js",
+    "index.html": "./index.html",
+    "main.5a572f5d.js.map": "./static/js/main.5a572f5d.js.map"
+  },
+  "entrypoints": [
+    "static/js/main.5a572f5d.js"
+  ]
+}
components/tree/frontend/build/index.html
ADDED
@@ -0,0 +1 @@
+<!doctype html><html lang="en"><head><title>Streamlit Tree Component</title><meta charset="UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Streamlit Tree Component"/><script defer="defer" src="./static/js/main.5a572f5d.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
components/tree/frontend/build/static/js/main.5a572f5d.js
ADDED
The diff for this file is too large to render.
components/tree/frontend/build/static/js/main.5a572f5d.js.LICENSE.txt
ADDED
@@ -0,0 +1,63 @@
+/*
+object-assign
+(c) Sindre Sorhus
+@license MIT
+*/
+
+/**
+ * @license React
+ * react-dom.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @license React
+ * react-jsx-runtime.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @license React
+ * react.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @license React
+ * scheduler.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/** @license React v16.13.1
+ * react-is.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/** @license React v16.14.0
+ * react.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
components/tree/frontend/build/static/js/main.5a572f5d.js.map
ADDED
The diff for this file is too large to render.
components/tree/frontend/package-lock.json
ADDED
The diff for this file is too large to render.
components/tree/frontend/package.json
ADDED
@@ -0,0 +1,47 @@
+{
+  "name": "tree_component",
+  "version": "0.1.0",
+  "private": true,
+  "dependencies": {
+    "@mui/icons-material": "^5.14.16",
+    "@mui/material": "^5.14.17",
+    "@mui/x-tree-view": "^6.17.0",
+    "react": "^18.2.0",
+    "react-dom": "^18.2.0",
+    "streamlit-component-lib": "^2.0.0"
+  },
+  "scripts": {
+    "start": "react-scripts start",
+    "build": "react-scripts build",
+    "test": "react-scripts test",
+    "eject": "react-scripts eject"
+  },
+  "eslintConfig": {
+    "extends": "react-app"
+  },
+  "browserslist": {
+    "production": [
+      ">0.2%",
+      "not dead",
+      "not op_mini all"
+    ],
+    "development": [
+      "last 1 chrome version",
+      "last 1 firefox version",
+      "last 1 safari version"
+    ]
+  },
+  "homepage": ".",
+  "devDependencies": {
+    "@types/node": "^20.9.0",
+    "@types/react": "^18.2.37",
+    "@types/react-dom": "^18.2.15",
+    "react-scripts": "^5.0.1",
+    "typescript": "^5.2.2"
+  },
+  "overrides": {
+    "react-scripts": {
+      "typescript": "^5"
+    }
+  }
+}
components/tree/frontend/public/index.html
ADDED
@@ -0,0 +1,27 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+  <title>Streamlit Tree Component</title>
+  <meta charset="UTF-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1" />
+  <meta name="theme-color" content="#000000" />
+  <meta name="description" content="Streamlit Tree Component" />
+</head>
+
+<body>
+  <noscript>You need to enable JavaScript to run this app.</noscript>
+  <div id="root"></div>
+  <!--
+    This HTML file is a template.
+    If you open it directly in the browser, you will see an empty page.
+
+    You can add webfonts, meta tags, or analytics to this file.
+    The build step will place the bundled scripts into the <body> tag.
+
+    To begin the development, run `npm start` or `yarn start`.
+    To create a production bundle, use `npm run build` or `yarn build`.
+  -->
+</body>
+
+</html>
components/tree/frontend/src/Tree.tsx
ADDED
@@ -0,0 +1,215 @@
+import {
+  Streamlit,
+  StreamlitComponentBase,
+  withStreamlitConnection,
+} from "streamlit-component-lib"
+import React, { ReactNode } from "react"
+import { styled, useTheme } from "@mui/material/styles"
+import Box from "@mui/material/Box"
+import Typography from "@mui/material/Typography"
+import FileCopyIcon from "@mui/icons-material/FileCopy"
+import InsertDriveFileIcon from "@mui/icons-material/InsertDriveFile"
+import ArrowDropDownIcon from "@mui/icons-material/ArrowDropDown"
+import ArrowRightIcon from "@mui/icons-material/ArrowRight"
+import { SvgIconProps } from "@mui/material/SvgIcon"
+import { TreeView } from "@mui/x-tree-view/TreeView"
+import {
+  TreeItem,
+  TreeItemProps,
+  treeItemClasses,
+} from "@mui/x-tree-view/TreeItem"
+
+// All code related to the MUI tree component is taken from https://mui.com/x/react-tree-view.
+declare module "react" {
+  interface CSSProperties {
+    "--tree-view-color"?: string
+    "--tree-view-bg-color"?: string
+  }
+}
+
+type StyledTreeItemProps = TreeItemProps & {
+  bgColor?: string
+  bgColorForDarkMode?: string
+  color?: string
+  colorForDarkMode?: string
+  labelIcon: React.ElementType<SvgIconProps>
+  labelInfo?: string
+  labelText: string
+}
+
+const StyledTreeItemRoot = styled(TreeItem)(({ theme }) => ({
+  color: theme.palette.text.secondary,
+  [`& .${treeItemClasses.content}`]: {
+    color: theme.palette.text.secondary,
+    borderTopRightRadius: theme.spacing(2),
+    borderBottomRightRadius: theme.spacing(2),
+    paddingRight: theme.spacing(1),
+    fontWeight: theme.typography.fontWeightMedium,
+    "&.Mui-expanded": {
+      fontWeight: theme.typography.fontWeightRegular,
+    },
+    "&:hover": {
+      backgroundColor: theme.palette.action.hover,
+    },
+    "&.Mui-focused, &.Mui-selected, &.Mui-selected.Mui-focused": {
+      backgroundColor: `var(--tree-view-bg-color, ${theme.palette.action.selected})`,
+      color: "var(--tree-view-color)",
+    },
+    [`& .${treeItemClasses.label}`]: {
+      fontWeight: "inherit",
+      color: "inherit",
+    },
+  },
+  [`& .${treeItemClasses.group}`]: {
+    marginLeft: 0,
+    [`& .${treeItemClasses.content}`]: {
+      paddingLeft: theme.spacing(2),
+    },
+  },
+})) as unknown as typeof TreeItem
+
+const StyledTreeItem = React.forwardRef(function StyledTreeItem(
+  props: StyledTreeItemProps,
+  ref: React.Ref<HTMLLIElement>
+) {
+  const theme = useTheme()
+  const {
+    bgColor,
+    color,
+    labelIcon: LabelIcon,
+    labelInfo,
+    labelText,
+    colorForDarkMode,
+    bgColorForDarkMode,
+    ...other
+  } = props
+
+  const styleProps = {
+    "--tree-view-color":
+      theme.palette.mode !== "dark" ? color : colorForDarkMode,
+    "--tree-view-bg-color":
+      theme.palette.mode !== "dark" ? bgColor : bgColorForDarkMode,
+  }
+
+  return (
+    <StyledTreeItemRoot
+      label={
+        <Box
+          sx={{
+            display: "flex",
+            alignItems: "center",
+            p: 0.5,
+            pr: 0,
+          }}
+        >
+          <Box component={LabelIcon} color="inherit" sx={{ mr: 1 }} />
+          <Typography
+            data-testid="tree-element"
+            variant="body2"
+            sx={{
+              whiteSpace: "nowrap",
+              overflow: "hidden",
+              textOverflow: "ellipsis",
+              fontWeight: "inherit",
+              flexGrow: 1,
+            }}
+          >
+            {labelText}
+          </Typography>
+          <Typography variant="caption" color="inherit">
+            {labelInfo}
+          </Typography>
+        </Box>
+      }
+      style={styleProps}
+      {...other}
+      ref={ref}
+    />
+  )
+})
+
+type Node = {
+  name: string
+  type: string
+  parents: string[]
+}
+
+type TreeNodes = { [key: string]: TreeNode }
+
+type TreeNode = Node & {
+  children: string[]
+}
+
+const TreeNodeComponent = ({
+  treeNode,
+  treeNodes,
+}: {
+  treeNode: TreeNode
+  treeNodes: TreeNodes
+}) => {
+  const { children } = treeNode
+  const childrenNodes = children
+    .filter((child) => child in treeNodes)
+    .map((child) => treeNodes[child])
+  const labelIcon =
+    treeNode.type === "FileObject" ? InsertDriveFileIcon : FileCopyIcon
+  return (
+    <StyledTreeItem
+      onClick={() => Streamlit.setComponentValue(treeNode.name)}
+      nodeId={treeNode.name}
+      labelText={treeNode.name}
+      labelIcon={labelIcon}
+    >
+      {childrenNodes.map((childNode) => (
+        <TreeNodeComponent
+          key={childNode.name}
+          treeNode={childNode}
+          treeNodes={treeNodes}
+        />
+      ))}
+    </StyledTreeItem>
+  )
+}
+
+const TreeViewWithNodes = ({ nodes }: { nodes: Node[] }) => {
+  const treeNodes: TreeNodes = {}
+  nodes.forEach((node) => {
+    treeNodes[node.name] = { ...node, children: [] }
+  })
+  nodes.forEach((node) => {
+    node.parents.forEach((parent) => {
+      if (parent in treeNodes) {
+        treeNodes[parent].children.push(node.name)
+      }
+    })
+  })
+
+  return (
+    <TreeView
+      defaultCollapseIcon={<ArrowDropDownIcon />}
+      defaultExpandIcon={<ArrowRightIcon />}
+      defaultEndIcon={<div style={{ width: 24 }} />}
+      expanded={Object.keys(treeNodes)}
+      sx={{
+        flexGrow: 1,
+        margin: -1,
+        padding: 1,
+        border: "1px solid rgba(23, 29, 48, 0.2)",
+        borderRadius: "0.5rem",
+      }}
+    >
+      {Object.values(treeNodes).map((treeNode) => {
+        return (
+          treeNode.parents.length === 0 && (
+            <TreeNodeComponent
+              key={treeNode.name}
+              treeNode={treeNode}
+              treeNodes={treeNodes}
+            />
+          )
+        )
+      })}
+    </TreeView>
+  )
+}
+
+class Tree extends StreamlitComponentBase<{}> {
+  public render = (): ReactNode => {
+    const nodes = this.props.args["nodes"]
+    return <TreeViewWithNodes nodes={nodes} />
+  }
+}
+
+export default withStreamlitConnection(Tree)
components/tree/frontend/src/index.tsx
ADDED
@@ -0,0 +1,10 @@
+import React from "react"
+import ReactDOM from "react-dom"
+import Tree from "./Tree"
+
+ReactDOM.render(
+  <React.StrictMode>
+    <Tree />
+  </React.StrictMode>,
+  document.getElementById("root")
+)
components/tree/frontend/src/react-app-env.d.ts
ADDED
@@ -0,0 +1 @@
+/// <reference types="react-scripts" />
components/tree/frontend/tsconfig.json
ADDED
@@ -0,0 +1,25 @@
+{
+  "compilerOptions": {
+    "target": "es5",
+    "lib": [
+      "dom",
+      "dom.iterable",
+      "esnext"
+    ],
+    "allowJs": true,
+    "skipLibCheck": true,
+    "esModuleInterop": true,
+    "allowSyntheticDefaultImports": true,
+    "strict": true,
+    "forceConsistentCasingInFileNames": true,
+    "module": "esnext",
+    "moduleResolution": "node",
+    "resolveJsonModule": true,
+    "isolatedModules": true,
+    "noEmit": true,
+    "jsx": "react"
+  },
+  "include": [
+    "src"
+  ]
+}
core/__init__.py
ADDED
File without changes
core/constants.py
ADDED
@@ -0,0 +1,10 @@
+from etils import epath
+
+import mlcroissant as mlc
+
+EDITOR_CACHE: epath.Path = mlc.constants.CROISSANT_CACHE / "editor"
+PAST_PROJECTS_PATH: epath.Path = EDITOR_CACHE / "projects"
+PROJECT_FOLDER_PATTERN = "%Y%m%d%H%M%S%f"
+
+
+DF_HEIGHT = 150
core/data_types.py
ADDED
@@ -0,0 +1,19 @@
+from typing import Any
+
+import numpy as np
+
+import mlcroissant as mlc
+
+
+def convert_dtype(dtype: Any):
+    """Converts from NumPy/Pandas to Croissant data types."""
+    if dtype == np.int64:
+        return mlc.DataType.INTEGER
+    elif dtype == np.float64:
+        return mlc.DataType.FLOAT
+    elif dtype == np.bool_:
+        return mlc.DataType.BOOL
+    elif dtype == np.str_ or dtype == object:
+        return mlc.DataType.TEXT
+    else:
+        raise NotImplementedError(dtype)
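A quick sketch of how `convert_dtype` is meant to be driven from a DataFrame's dtypes, as `record_sets.py` does below (the DataFrame here is a made-up example):

    import pandas as pd

    from core.data_types import convert_dtype

    df = pd.DataFrame({"a": [1, 2], "b": [0.5, 1.5], "c": ["x", "y"]})
    for column, dtype in df.dtypes.items():
        # int64 -> DataType.INTEGER, float64 -> DataType.FLOAT, object -> DataType.TEXT
        print(column, convert_dtype(dtype))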
core/data_types_test.py
ADDED
@@ -0,0 +1,15 @@
+"""Tests for data_types."""
+
+import numpy as np
+import pytest
+
+from .data_types import convert_dtype
+
+
+def test_convert_dtype():
+    assert convert_dtype(np.int64) == "https://schema.org/Integer"
+    assert convert_dtype(np.float64) == "https://schema.org/Float"
+    assert convert_dtype(np.bool_) == "https://schema.org/Boolean"
+    assert convert_dtype(np.str_) == "https://schema.org/Text"
+    with pytest.raises(NotImplementedError):
+        convert_dtype(np.float32)
core/files.py
ADDED
@@ -0,0 +1,154 @@
+import dataclasses
+import hashlib
+import io
+import tempfile
+
+from etils import epath
+import pandas as pd
+import requests
+
+from .names import find_unique_name
+from .state import FileObject
+from .state import FileSet
+
+FILE_OBJECT = "File object"
+FILE_SET = "File set"
+RESOURCE_TYPES = [FILE_OBJECT, FILE_SET]
+
+
+@dataclasses.dataclass
+class FileType:
+    name: str
+    encoding_format: str
+    extensions: list[str]
+
+
+class FileTypes:
+    CSV = FileType(name="CSV", encoding_format="text/csv", extensions=["csv"])
+    EXCEL = FileType(
+        name="Excel",
+        encoding_format="application/vnd.ms-excel",
+        extensions=["xls", "xlsx", "xlsm"],
+    )
+    JSON = FileType(
+        name="JSON", encoding_format="application/json", extensions=["json"]
+    )
+    JSONL = FileType(
+        name="JSON-Lines",
+        encoding_format="application/jsonl+json",
+        extensions=["jsonl"],
+    )
+    PARQUET = FileType(
+        name="Parquet",
+        encoding_format="application/vnd.apache.parquet",
+        extensions=["parquet"],
+    )
+
+
+FILE_TYPES: dict[str, FileType] = {
+    file_type.name: file_type
+    for file_type in [
+        FileTypes.CSV,
+        FileTypes.EXCEL,
+        FileTypes.JSON,
+        FileTypes.JSONL,
+        FileTypes.PARQUET,
+    ]
+}
+
+
+def _sha256(content: bytes):
+    """Computes the sha256 digest of the byte string."""
+    return hashlib.sha256(content).hexdigest()
+
+
+def hash_file_path(url: str) -> epath.Path:
+    """Reproducibly produces the file path."""
+    tempdir = epath.Path(tempfile.gettempdir())
+    hash = _sha256(url.encode())
+    return tempdir / f"croissant-editor-{hash}"
+
+
+def download_file(url: str, file_path: epath.Path):
+    """Downloads the file locally to `file_path`."""
+    with requests.get(url, stream=True) as request:
+        request.raise_for_status()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmp_file = epath.Path(tmpdir) / "file"
+            with tmp_file.open("wb") as file:
+                for chunk in request.iter_content(chunk_size=8192):
+                    file.write(chunk)
+            tmp_file.copy(file_path)
+
+
+def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
+    """Gets the df associated to the file."""
+    if file_type == FileTypes.CSV:
+        return pd.read_csv(file)
+    elif file_type == FileTypes.EXCEL:
+        return pd.read_excel(file)
+    elif file_type == FileTypes.JSON:
+        return pd.read_json(file)
+    elif file_type == FileTypes.JSONL:
+        return pd.read_json(file, lines=True)
+    elif file_type == FileTypes.PARQUET:
+        return pd.read_parquet(file)
+    else:
+        raise NotImplementedError()
+
+
+def file_from_url(file_type: FileType, url: str, names: set[str]) -> FileObject:
+    """Downloads locally and extracts the file information."""
+    file_path = hash_file_path(url)
+    if not file_path.exists():
+        download_file(url, file_path)
+    with file_path.open("rb") as file:
+        sha256 = _sha256(file.read())
+    df = get_dataframe(file_type, file_path).infer_objects()
+    return FileObject(
+        name=find_unique_name(names, url.split("/")[-1]),
+        description="",
+        content_url=url,
+        encoding_format=file_type.encoding_format,
+        sha256=sha256,
+        df=df,
+    )
+
+
+def file_from_upload(
+    file_type: FileType, file: io.BytesIO, names: set[str]
+) -> FileObject:
+    """Uploads locally and extracts the file information."""
+    sha256 = _sha256(file.getvalue())
+    df = get_dataframe(file_type, file).infer_objects()
+    return FileObject(
+        name=find_unique_name(names, file.name),
+        description="",
+        content_url=f"data/{file.name}",
+        encoding_format=file_type.encoding_format,
+        sha256=sha256,
+        df=df,
+    )
+
+
+def file_from_form(
+    file_type: FileType, type: str, name, description, sha256: str, names: set[str]
+) -> FileObject | FileSet:
+    """Creates a file based on manually added fields."""
+    if type == FILE_OBJECT:
+        return FileObject(
+            name=find_unique_name(names, name),
+            description=description,
+            content_url="",
+            encoding_format=file_type.encoding_format,
+            sha256=sha256,
+            df=None,
+        )
+    elif type == FILE_SET:
+        return FileSet(
+            name=find_unique_name(names, name),
+            description=description,
+            encoding_format=file_type.encoding_format,
+        )
+    else:
+        raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
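A sketch of the caching scheme behind `file_from_url`: `hash_file_path` maps a URL deterministically to a temp path, so a second load of the same URL skips `download_file`. The example URL and digest match `files_test.py` below:

    from core.files import hash_file_path

    path = hash_file_path("https://my.url")
    # /tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc
    print(path)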
core/files_test.py
ADDED
@@ -0,0 +1,27 @@
+from etils import epath
+import pandas as pd
+import pytest
+
+from .files import file_from_url
+from .files import FileTypes
+
+
+def test_check_file_csv():
+    csv = epath.Path(
+        # This is the hash path for "https://my.url".
+        "/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
+    )
+    if csv.exists():
+        csv.unlink()
+    with csv.open("w") as f:
+        f.write("column1,column2\n")
+        f.write("a,1\n")
+        f.write("b,2\n")
+        f.write("c,3\n")
+    file = file_from_url(FileTypes.CSV, "https://my.url", set())
+    pd.testing.assert_frame_equal(
+        file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
+    )
+    # Fails with unknown encoding_format:
+    with pytest.raises(NotImplementedError):
+        file_from_url("unknown", "https://my.url", set())
core/names.py
ADDED
@@ -0,0 +1,8 @@
+"""Module to handle naming of RecordSets and distribution."""
+
+
+def find_unique_name(names: set[str], name: str):
+    """Finds a unique name by appending `_0` until it is not taken."""
+    while name in names:
+        name = f"{name}_0"
+    return name
core/names_test.py
ADDED
@@ -0,0 +1,10 @@
+"""Tests for `names` module."""
+
+from .names import find_unique_name
+
+
+def test_find_unique_name():
+    names = set(["first", "second", "first_0"])
+    assert find_unique_name(names, "first") == "first_0_0"
+    assert find_unique_name(names, "second") == "second_0"
+    assert find_unique_name(names, "third") == "third"
core/past_projects.py
ADDED
@@ -0,0 +1,34 @@
+import logging
+import pickle
+
+from etils import epath
+import streamlit as st
+
+from core.constants import PAST_PROJECTS_PATH
+from core.state import CurrentProject
+from core.state import Metadata
+
+
+def load_past_projects_paths() -> list[epath.Path]:
+    PAST_PROJECTS_PATH.mkdir(parents=True, exist_ok=True)
+    return sorted(list(PAST_PROJECTS_PATH.iterdir()), reverse=True)
+
+
+def _pickle_file(path: epath.Path) -> epath.Path:
+    return path / ".metadata.pkl"
+
+
+def save_current_project():
+    metadata = st.session_state[Metadata]
+    project = st.session_state[CurrentProject]
+    project.path.mkdir(parents=True, exist_ok=True)
+    with _pickle_file(project.path).open("wb") as file:
+        try:
+            pickle.dump(metadata, file)
+        except pickle.PicklingError:
+            logging.error("Could not pickle metadata.")
+
+
+def open_project(path: epath.Path) -> Metadata:
+    with _pickle_file(path).open("rb") as file:
+        return pickle.load(file)
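A hedged round-trip sketch: `save_current_project` reads `Metadata` and `CurrentProject` out of `st.session_state`, so both keys must be populated before calling it (the dataset values here are hypothetical):

    import streamlit as st

    from core.past_projects import open_project, save_current_project
    from core.state import CurrentProject, Metadata

    st.session_state[Metadata] = Metadata(name="MyDataset", url="https://mydataset.com")
    project = CurrentProject.create_new()
    st.session_state[CurrentProject] = project
    save_current_project()  # pickles the metadata under project.path/.metadata.pkl

    assert open_project(project.path).name == "MyDataset"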
core/record_sets.py
ADDED
@@ -0,0 +1,38 @@
+from core.data_types import convert_dtype
+from core.names import find_unique_name
+from core.state import Field
+from core.state import FileObject
+from core.state import FileSet
+from core.state import RecordSet
+import mlcroissant as mlc
+
+
+def infer_record_sets(file: FileObject | FileSet, names: set[str]) -> list[RecordSet]:
+    """Infers one or several ml:RecordSets from a FileObject/FileSet."""
+    # For the moment, there is no inference support for FileSets.
+    if isinstance(file, FileSet):
+        return []
+    # We can infer only if the underlying `pd.DataFrame` could be built.
+    if file.df is None:
+        return []
+    fields = []
+    for column, value in file.df.dtypes.items():
+        source = mlc.Source(
+            uid=file.name,
+            node_type="distribution",
+            extract=mlc.Extract(column=column),
+        )
+        field = Field(
+            name=column,
+            data_types=[convert_dtype(value)],
+            source=source,
+            references=mlc.Source(),
+        )
+        fields.append(field)
+    return [
+        RecordSet(
+            fields=fields,
+            name=find_unique_name(names, file.name + "_record_set"),
+            description="",
+        )
+    ]
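A minimal sketch of the inference path: any `FileObject` with a DataFrame attached yields one `RecordSet` with one `Field` per column (the file name and columns are made up):

    import pandas as pd

    from core.record_sets import infer_record_sets
    from core.state import FileObject

    file = FileObject(name="data.csv", df=pd.DataFrame({"a": [1], "b": ["x"]}))
    record_sets = infer_record_sets(file, names=set())
    assert record_sets[0].name == "data.csv_record_set"
    assert [field.name for field in record_sets[0].fields] == ["a", "b"]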
core/state.py
ADDED
@@ -0,0 +1,261 @@
+"""Streamlit session state.
+
+In the future, this could be the serialization format between front and back.
+"""
+
+from __future__ import annotations
+
+import dataclasses
+import datetime
+from typing import Any
+
+from etils import epath
+import pandas as pd
+
+from core.constants import PAST_PROJECTS_PATH
+from core.constants import PROJECT_FOLDER_PATTERN
+import mlcroissant as mlc
+
+
+def create_class(mlc_class: type, instance: Any, **kwargs) -> Any:
+    """Creates the mlcroissant class `mlc_class` from the editor `instance`."""
+    fields = dataclasses.fields(mlc_class)
+    params: dict[str, Any] = {}
+    for field in fields:
+        name = field.name
+        if hasattr(instance, name) and name not in kwargs:
+            params[name] = getattr(instance, name)
+    return mlc_class(**params, **kwargs)
+
+
+class CurrentStep:
+    """The application's major steps; the current one is kept in the session state."""
+
+    splash = "splash"
+    editor = "editor"
+
+
+@dataclasses.dataclass
+class CurrentProject:
+    """The selected project."""
+
+    path: epath.Path
+
+    @classmethod
+    def create_new(cls) -> CurrentProject:
+        timestamp = datetime.datetime.now().strftime(PROJECT_FOLDER_PATTERN)
+        return CurrentProject(path=PAST_PROJECTS_PATH / timestamp)
+
+
+class SelectedResource:
+    """The selected FileSet or FileObject on the `Resources` page."""
+
+    pass
+
+
+@dataclasses.dataclass
+class SelectedRecordSet:
+    """The selected RecordSet on the `RecordSets` page."""
+
+    record_set_key: int
+    record_set: RecordSet
+
+
+@dataclasses.dataclass
+class FileObject:
+    """FileObject analogue for editor"""
+
+    name: str | None = None
+    description: str | None = None
+    contained_in: list[str] | None = dataclasses.field(default_factory=list)
+    content_size: str | None = None
+    content_url: str | None = None
+    encoding_format: str | None = None
+    sha256: str | None = None
+    df: pd.DataFrame | None = None
+    rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
+
+
+@dataclasses.dataclass
+class FileSet:
+    """FileSet analogue for editor"""
+
+    contained_in: list[str] = dataclasses.field(default_factory=list)
+    description: str | None = None
+    encoding_format: str | None = ""
+    includes: str | None = ""
+    name: str = ""
+    rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
+
+
+@dataclasses.dataclass
+class Field:
+    """Field analogue for editor"""
+
+    name: str | None = None
+    description: str | None = None
+    data_types: str | list[str] | None = None
+    source: mlc.Source | None = None
+    rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
+    references: mlc.Source | None = None
+
+
+@dataclasses.dataclass
+class RecordSet:
+    """Record Set analogue for editor"""
+
+    name: str = ""
+    data: Any = None
+    description: str | None = None
+    is_enumeration: bool | None = None
+    key: str | list[str] | None = None
+    fields: list[Field] = dataclasses.field(default_factory=list)
+    rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
+
+
+@dataclasses.dataclass
+class Metadata:
+    """Main Croissant metadata object for the editor.
+
+    `to_canonical`/`from_canonical` convert it to and from the mlcroissant version.
+    """
+
+    name: str = ""
+    description: str | None = None
+    citation: str | None = None
+    license: str | None = ""
+    url: str = ""
+    distribution: list[FileObject | FileSet] = dataclasses.field(default_factory=list)
+    record_sets: list[RecordSet] = dataclasses.field(default_factory=list)
+    rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
+
+    def __bool__(self):
+        return self.name != "" and self.url != ""
+
+    def rename_distribution(self, old_name: str, new_name: str):
+        """Renames a resource by changing all the references to this resource."""
+        # Update other resources:
+        for i, resource in enumerate(self.distribution):
+            contained_in = resource.contained_in
+            if contained_in and old_name in contained_in:
+                self.distribution[i].contained_in = [
+                    new_name if name == old_name else name for name in contained_in
+                ]
+        # Updating source/references works just as with RecordSets.
+        self.rename_record_set(old_name, new_name)
+
+    def rename_record_set(self, old_name: str, new_name: str):
+        """Renames a RecordSet by changing all the references to this RecordSet."""
+        for i, record_set in enumerate(self.record_sets):
+            for j, field in enumerate(record_set.fields):
+                # Update source
+                source = field.source
+                if source and source.uid and source.uid.startswith(old_name):
+                    new_uid = source.uid.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].source.uid = new_uid
+                # Update references
+                references = field.references
+                if (
+                    references
+                    and references.uid
+                    and references.uid.startswith(old_name)
+                ):
+                    new_uid = references.uid.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].references.uid = new_uid
+
+    def rename_field(self, old_name: str, new_name: str):
+        """Renames a field by changing all the references to this field."""
+        for i, record_set in enumerate(self.record_sets):
+            for j, field in enumerate(record_set.fields):
+                # Update source
+                source = field.source
+                # The difference with RecordSet is the `.endswith` here:
+                if (
+                    source
+                    and source.uid
+                    and "/" in source.uid
+                    and source.uid.endswith(old_name)
+                ):
+                    new_uid = source.uid.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].source.uid = new_uid
+                # Update references
+                references = field.references
+                if (
+                    references
+                    and references.uid
+                    and "/" in references.uid
+                    and references.uid.endswith(old_name)
+                ):
+                    new_uid = references.uid.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].references.uid = new_uid
+
+    def add_distribution(self, distribution: FileSet | FileObject) -> None:
+        self.distribution.append(distribution)
+
+    def remove_distribution(self, key: int) -> None:
+        del self.distribution[key]
+
+    def add_record_set(self, record_set: RecordSet) -> None:
+        self.record_sets.append(record_set)
+
+    def remove_record_set(self, key: int) -> None:
+        del self.record_sets[key]
+
+    def _find_record_set(self, record_set_key: int) -> RecordSet:
+        if record_set_key >= len(self.record_sets):
+            raise ValueError(f"Wrong index when finding a RecordSet: {record_set_key}")
+        return self.record_sets[record_set_key]
+
+    def add_field(self, record_set_key: int, field: Field) -> None:
+        record_set = self._find_record_set(record_set_key)
+        record_set.fields.append(field)
+
+    def remove_field(self, record_set_key: int, field_key: int) -> None:
+        record_set = self._find_record_set(record_set_key)
+        if field_key >= len(record_set.fields):
+            raise ValueError(f"Wrong index when removing field: {field_key}")
+        del record_set.fields[field_key]
+
+    def to_canonical(self) -> mlc.Metadata:
+        distribution = []
+        for file in self.distribution:
+            if isinstance(file, FileObject):
+                distribution.append(create_class(mlc.FileObject, file))
+            elif isinstance(file, FileSet):
+                distribution.append(create_class(mlc.FileSet, file))
+        record_sets = []
+        for record_set in self.record_sets:
+            fields = []
+            for field in record_set.fields:
+                fields.append(create_class(mlc.Field, field))
+            record_sets.append(create_class(mlc.RecordSet, record_set, fields=fields))
+        return create_class(
+            mlc.Metadata,
+            self,
+            distribution=distribution,
+            record_sets=record_sets,
+        )
+
+    @classmethod
+    def from_canonical(cls, canonical_metadata: mlc.Metadata) -> Metadata:
+        distribution = []
+        for file in canonical_metadata.distribution:
+            if isinstance(file, mlc.FileObject):
+                distribution.append(create_class(FileObject, file))
+            else:
+                distribution.append(create_class(FileSet, file))
+        record_sets = []
+        for record_set in canonical_metadata.record_sets:
+            fields = []
+            for field in record_set.fields:
+                fields.append(create_class(Field, field))
+            record_sets.append(
+                create_class(
+                    RecordSet,
+                    record_set,
+                    fields=fields,
+                )
+            )
+        return create_class(
+            cls,
+            canonical_metadata,
+            distribution=distribution,
+            record_sets=record_sets,
+        )
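A sketch of the editor/mlcroissant round trip built on `create_class`: fields are copied by name, so editor-only attributes such as `df` are dropped on the way to the canonical classes. The values are hypothetical, and this assumes `mlc.Metadata` accepts these minimal fields:

    from core.state import FileObject, Metadata

    metadata = Metadata(name="MyDataset", url="https://mydataset.com")
    metadata.add_distribution(
        FileObject(name="data.csv", content_url="data/data.csv", encoding_format="text/csv")
    )
    canonical = metadata.to_canonical()  # mlc.Metadata, without the editor-only `df`
    assert Metadata.from_canonical(canonical).name == "MyDataset"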
cypress.config.js
ADDED
@@ -0,0 +1,7 @@
+const { defineConfig } = require("cypress");
+
+module.exports = defineConfig({
+  // To access content within Streamlit iframes for custom components:
+  chromeWebSecurity: false,
+  e2e: {},
+});
cypress/downloads/croissant-Titanic.json
ADDED
@@ -0,0 +1 @@
{"@context": {"@language": "en", "@vocab": "https://schema.org/", "column": "ml:column", "data": {"@id": "ml:data", "@type": "@json"}, "dataType": {"@id": "ml:dataType", "@type": "@vocab"}, "extract": "ml:extract", "field": "ml:field", "fileProperty": "ml:fileProperty", "format": "ml:format", "includes": "ml:includes", "isEnumeration": "ml:isEnumeration", "jsonPath": "ml:jsonPath", "ml": "http://mlcommons.org/schema/", "parentField": "ml:parentField", "path": "ml:path", "recordSet": "ml:recordSet", "references": "ml:references", "regex": "ml:regex", "repeated": "ml:repeated", "replace": "ml:replace", "sc": "https://schema.org/", "separator": "ml:separator", "source": "ml:source", "subField": "ml:subField", "transform": "ml:transform", "wd": "https://www.wikidata.org/wiki/"}, "@type": "sc:Dataset", "name": "Titanic", "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n", "citation": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. 
Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n", "license": "Public", "url": "https://www.openml.org/d/40945", "distribution": [{"@type": "sc:FileObject", "name": "passengers.csv", "contentSize": "117743 B", "contentUrl": "https://www.openml.org/data/get_csv/16826755/phpMYEkMl", "encodingFormat": "text/csv", "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"}, {"@type": "sc:FileObject", "name": "genders.csv", "description": "Maps gender values (\"male\", \"female\") to semantic URLs.", "contentSize": "117743 B", "contentUrl": "data/genders.csv", "encodingFormat": "text/csv", "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"}, {"@type": "sc:FileObject", "name": "embarkation_ports.csv", "description": "Maps Embarkation port initial to labeled values.", "contentSize": "117743 B", "contentUrl": "data/embarkation_ports.csv", "encodingFormat": "text/csv", "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"}], "recordSet": [{"@type": "ml:RecordSet", "name": "genders", "description": "Maps gender labels to semantic definitions.", "isEnumeration": true, "key": "label", "field": [{"@type": "ml:Field", "name": "label", "description": "One of {\"male\", \"female\"}", "dataType": ["sc:Text", "sc:name"], "source": {"distribution": "genders.csv", "extract": {"column": "label"}}}, {"@type": "ml:Field", "name": "url", "description": "Corresponding WikiData URL", "dataType": ["sc:URL", "wd:Q48277"], "source": {"distribution": "genders.csv", "extract": {"column": "url"}}}]}, {"@type": "ml:RecordSet", "name": "embarkation_ports", "description": "Maps Embarkation port initial to labeled values.", "isEnumeration": true, "key": "key", "field": [{"@type": "ml:Field", "name": "key", "description": "C, Q, S or ?", "dataType": "sc:Text", "source": {"distribution": "embarkation_ports.csv", "extract": {"column": "key"}}}, {"@type": "ml:Field", "name": "label", "description": "Human-readable label", "dataType": ["sc:Text", "sc:name"], "source": {"distribution": "embarkation_ports.csv", "extract": {"column": "label"}}}, {"@type": "ml:Field", "name": "url", "description": "Corresponding WikiData URL", "dataType": ["sc:URL", "wd:Q515"], "source": {"distribution": "embarkation_ports.csv", "extract": {"column": "url"}}}]}, {"@type": "ml:RecordSet", "name": "passengers", "description": "The list of passengers. Does not include crew members.", "field": [{"@type": "ml:Field", "name": "name", "description": "Name of the passenger", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "name"}}}, {"@type": "ml:Field", "name": "gender", "description": "Gender of passenger (male or female)", "dataType": "sc:Text", "references": {"field": "genders/label"}, "source": {"distribution": "passengers.csv", "extract": {"column": "sex"}}}, {"@type": "ml:Field", "name": "age", "description": "Age of passenger at time of death. 
It's a string, because some values can be `?`.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "age"}}}, {"@type": "ml:Field", "name": "survived", "description": "Survival status of passenger (0: Lost, 1: Saved)", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "survived"}}}, {"@type": "ml:Field", "name": "pclass", "description": "Passenger Class (1st/2nd/3rd)", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "pclass"}}}, {"@type": "ml:Field", "name": "cabin", "description": "Passenger cabin.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "cabin"}}}, {"@type": "ml:Field", "name": "embarked", "description": "Port of Embarkation (C: Cherbourg, Q: Queenstown, S: Southampton, ?: Unknown).", "dataType": "sc:Text", "references": {"field": "embarkation_ports/key"}, "source": {"distribution": "passengers.csv", "extract": {"column": "embarked"}}}, {"@type": "ml:Field", "name": "fare", "description": "Passenger Fare (British pound). It's a string, because some values can be `?`.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "fare"}}}, {"@type": "ml:Field", "name": "home_destination", "description": "Home and destination", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "home.dest"}}}, {"@type": "ml:Field", "name": "ticket", "description": "Ticket Number, may include a letter.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "ticket"}}}, {"@type": "ml:Field", "name": "num_parents_children", "description": "Number of Parents/Children Aboard", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "parch"}}}, {"@type": "ml:Field", "name": "num_siblings_spouses", "description": "Number of Siblings/Spouses Aboard", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "sibsp"}}}, {"@type": "ml:Field", "name": "boat", "description": "Lifeboat used by passenger", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "boat"}}}, {"@type": "ml:Field", "name": "body", "description": "Body Identification Number", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "body"}}}]}]}
cypress/downloads/croissant.json
ADDED
@@ -0,0 +1 @@
{"@context": {"@language": "en", "@vocab": "https://schema.org/", "column": "ml:column", "data": {"@id": "ml:data", "@type": "@json"}, "dataType": {"@id": "ml:dataType", "@type": "@vocab"}, "extract": "ml:extract", "field": "ml:field", "fileProperty": "ml:fileProperty", "format": "ml:format", "includes": "ml:includes", "isEnumeration": "ml:isEnumeration", "jsonPath": "ml:jsonPath", "ml": "http://mlcommons.org/schema/", "parentField": "ml:parentField", "path": "ml:path", "recordSet": "ml:recordSet", "references": "ml:references", "regex": "ml:regex", "repeated": "ml:repeated", "replace": "ml:replace", "sc": "https://schema.org/", "separator": "ml:separator", "source": "ml:source", "subField": "ml:subField", "transform": "ml:transform", "wd": "https://www.wikidata.org/wiki/"}, "@type": "sc:Dataset", "name": "Titanic", "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n", "citation": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. 
Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n", "license": "Public", "url": "https://www.openml.org/d/40945", "distribution": [{"@type": "sc:FileObject", "name": "passengers.csv", "contentSize": "117743 B", "contentUrl": "https://www.openml.org/data/get_csv/16826755/phpMYEkMl", "encodingFormat": "text/csv", "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"}, {"@type": "sc:FileObject", "name": "genders.csv", "description": "Maps gender values (\"male\", \"female\") to semantic URLs.", "contentSize": "117743 B", "contentUrl": "data/genders.csv", "encodingFormat": "text/csv", "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"}, {"@type": "sc:FileObject", "name": "embarkation_ports.csv", "description": "Maps Embarkation port initial to labeled values.", "contentSize": "117743 B", "contentUrl": "data/embarkation_ports.csv", "encodingFormat": "text/csv", "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"}], "recordSet": [{"@type": "ml:RecordSet", "name": "genders", "description": "Maps gender labels to semantic definitions.", "isEnumeration": true, "key": "label", "field": [{"@type": "ml:Field", "name": "label", "description": "One of {\"male\", \"female\"}", "dataType": ["sc:Text", "sc:name"], "source": {"distribution": "genders.csv", "extract": {"column": "label"}}}, {"@type": "ml:Field", "name": "url", "description": "Corresponding WikiData URL", "dataType": ["sc:URL", "wd:Q48277"], "source": {"distribution": "genders.csv", "extract": {"column": "url"}}}]}, {"@type": "ml:RecordSet", "name": "embarkation_ports", "description": "Maps Embarkation port initial to labeled values.", "isEnumeration": true, "key": "key", "field": [{"@type": "ml:Field", "name": "key", "description": "C, Q, S or ?", "dataType": "sc:Text", "source": {"distribution": "embarkation_ports.csv", "extract": {"column": "key"}}}, {"@type": "ml:Field", "name": "label", "description": "Human-readable label", "dataType": ["sc:Text", "sc:name"], "source": {"distribution": "embarkation_ports.csv", "extract": {"column": "label"}}}, {"@type": "ml:Field", "name": "url", "description": "Corresponding WikiData URL", "dataType": ["sc:URL", "wd:Q515"], "source": {"distribution": "embarkation_ports.csv", "extract": {"column": "url"}}}]}, {"@type": "ml:RecordSet", "name": "passengers", "description": "The list of passengers. Does not include crew members.", "field": [{"@type": "ml:Field", "name": "name", "description": "Name of the passenger", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "name"}}}, {"@type": "ml:Field", "name": "gender", "description": "Gender of passenger (male or female)", "dataType": "sc:Text", "references": {"field": "genders/label"}, "source": {"distribution": "passengers.csv", "extract": {"column": "sex"}}}, {"@type": "ml:Field", "name": "age", "description": "Age of passenger at time of death. 
It's a string, because some values can be `?`.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "age"}}}, {"@type": "ml:Field", "name": "survived", "description": "Survival status of passenger (0: Lost, 1: Saved)", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "survived"}}}, {"@type": "ml:Field", "name": "pclass", "description": "Passenger Class (1st/2nd/3rd)", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "pclass"}}}, {"@type": "ml:Field", "name": "cabin", "description": "Passenger cabin.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "cabin"}}}, {"@type": "ml:Field", "name": "embarked", "description": "Port of Embarkation (C: Cherbourg, Q: Queenstown, S: Southampton, ?: Unknown).", "dataType": "sc:Text", "references": {"field": "embarkation_ports/key"}, "source": {"distribution": "passengers.csv", "extract": {"column": "embarked"}}}, {"@type": "ml:Field", "name": "fare", "description": "Passenger Fare (British pound). It's a string, because some values can be `?`.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "fare"}}}, {"@type": "ml:Field", "name": "home_destination", "description": "Home and destination", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "home.dest"}}}, {"@type": "ml:Field", "name": "ticket", "description": "Ticket Number, may include a letter.", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "ticket"}}}, {"@type": "ml:Field", "name": "num_parents_children", "description": "Number of Parents/Children Aboard", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "parch"}}}, {"@type": "ml:Field", "name": "num_siblings_spouses", "description": "Number of Siblings/Spouses Aboard", "dataType": "sc:Integer", "source": {"distribution": "passengers.csv", "extract": {"column": "sibsp"}}}, {"@type": "ml:Field", "name": "boat", "description": "Lifeboat used by passenger", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "boat"}}}, {"@type": "ml:Field", "name": "body", "description": "Body Identification Number", "dataType": "sc:Text", "source": {"distribution": "passengers.csv", "extract": {"column": "body"}}}]}]}
|
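A minimal sketch of how such a downloaded Croissant file could be consumed with the mlcroissant library (assuming mlcroissant is installed and the contentUrl values above are reachable; the file path and record-set name are taken from this fixture):

import mlcroissant as mlc

# Load and validate the downloaded Croissant JSON-LD file.
dataset = mlc.Dataset("cypress/downloads/croissant.json")

# Stream the first few records of the "passengers" RecordSet.
for i, record in enumerate(dataset.records(record_set="passengers")):
    print(record)
    if i >= 2:
        break
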
cypress/e2e/createManually.cy.js
ADDED
@@ -0,0 +1,35 @@
/// <reference types="cypress" />

import 'cypress-file-upload';
import 'cypress-iframe';


describe('Create a resource manually', () => {
  it('should allow adding a FileObject resource', () => {
    // Streamlit starts on :8501.
    cy.visit('http://localhost:8501')
    cy.get('button', {timeout: 10000}).contains('Create', {timeout: 10000}).click()
    cy.get('input[aria-label="Name:red[*]"]').type('MyDataset').blur()
    cy.get('[data-testid="stMarkdownContainer"]')
      .contains('Metadata')
      .click()
    cy.get('input[aria-label="URL:red[*]"]').type('https://mydataset.com', {force: true})

    // Create a resource manually.
    cy.get('[data-testid="stMarkdownContainer"]').contains('Resources').click()
    cy.get('[data-testid="stMarkdownContainer"]').contains('Add manually').click()

    cy.get('input[aria-label="File name:red[*]"]').type('test.csv').blur()
    cy.get('input[aria-label="SHA256"]').type('abcdefgh1234567').blur()
    cy.get('button').contains('Upload').click()

    // The file is created, so we can click on it to see the details.
    cy.enter('[title="components.tree.tree_component"]').then(getBody => {
      getBody().contains('test.csv').click()
    })

    cy.get('input[aria-label="SHA256:red[*]"]')
      .should('be.disabled')
      .should('have.value', 'abcdefgh1234567')
  })
})

cypress/e2e/displayErrors.cy.js
ADDED
@@ -0,0 +1,30 @@
/// <reference types="cypress" />

import 'cypress-file-upload';

describe('load existing errored croissant', () => {
  it('should display errors', () => {
    cy.visit('http://localhost:8501')

    cy.fixture('coco.json').then((fileContent) => {
      const file = {
        fileContent,
        fileName: 'coco.json', mimeType: 'text/json',
      }
      cy.get(
        "[data-testid='stFileUploadDropzone']",
      ).attachFile(file, {
        force: true,
        subjectType: "drag-n-drop",
        events: ["dragenter", "drop"],
      })
    })
    cy.get('[data-testid="stMarkdownContainer"]').contains("Errors").should('not.exist')
    // Empty the `name` field to create an error:
    cy.get('[data-testid="stMarkdownContainer"]').contains('RecordSets').click()
    cy.contains('split_enums (2 fields)').click()
    cy.get('input[aria-label="Name:red[*]"][value="split_enums"]').should('be.visible').type('{selectall}{backspace}{enter}')
    cy.get('[data-testid="stMarkdownContainer"]').contains('Overview').click()
    cy.get('[data-testid="stMarkdownContainer"]').contains("Errors").should('exist')
  })
})

cypress/e2e/loadCroissant.cy.js
ADDED
@@ -0,0 +1,61 @@
/// <reference types="cypress" />

import 'cypress-file-upload';
import * as path from 'path';

describe('Editor loads Croissant without Error', () => {
  it('should allow uploading existing croissant files', () => {
    cy.visit('http://localhost:8501')

    cy.fixture('titanic.json').then((fileContent) => {
      const file = {
        fileContent,
        fileName: 'titanic.json', mimeType: 'text/json',
      }
      cy.get(
        "[data-testid='stFileUploadDropzone']",
      ).attachFile(file, {
        force: true,
        subjectType: "drag-n-drop",
        events: ["dragenter", "drop"],
      })
    })
    cy.get('button').contains('Metadata').click()

    cy
      .get("[data-testid='element-container']")
      .contains('Titanic')
      .should('exist')

  })
  it('should download as json', () => {
    cy.visit('http://localhost:8501')

    cy.fixture('titanic.json').then((fileContent) => {
      const file = {
        fileContent,
        fileName: 'titanic.json', mimeType: 'text/json',
      }
      cy.get(
        "[data-testid='stFileUploadDropzone']",
      ).attachFile(file, {
        force: true,
        subjectType: "drag-n-drop",
        events: ["dragenter", "drop"],
      })
    })

    cy.get('[data-testid="stException"]').should('not.exist')

    cy.get('button').contains('Export').should('exist').should('be.visible').click({force: true})
    cy.fixture('titanic.json').then((fileContent) => {
      const downloadsFolder = Cypress.config("downloadsFolder");
      cy.readFile(path.join(downloadsFolder, "croissant-titanic.json"))
        .then((downloadedFile) => {
          downloadedFile = JSON.stringify(downloadedFile)
          return downloadedFile
        })
        .should('deep.equal', JSON.stringify(fileContent))
    })
  })
})

cypress/e2e/renameDistribution.cy.js
ADDED
@@ -0,0 +1,36 @@
/// <reference types="cypress" />

import 'cypress-file-upload';
import 'cypress-iframe';


describe('Renaming of FileObjects/FileSets/RecordSets/Fields.', () => {
  it('should rename the FileObject/FileSet everywhere', () => {
    cy.visit('http://localhost:8501')

    cy.fixture('titanic.json').then((fileContent) => {
      const file = {
        fileContent,
        fileName: 'titanic.json', mimeType: 'text/json',
      }
      cy.get(
        "[data-testid='stFileUploadDropzone']",
      ).attachFile(file, {
        force: true,
        subjectType: "drag-n-drop",
        events: ["dragenter", "drop"],
      })
    })
    cy.get('button').contains('Resources').click()
    cy.enter('[title="components.tree.tree_component"]').then(getBody => {
      // Click on genders.csv
      getBody().contains('genders.csv').click()
    })
    cy.get('input[aria-label="Name:red[*]"][value="genders.csv"]').type('{selectall}{backspace}the-new-name{enter}')

    cy.get('button').contains('RecordSets').click()
    cy.contains('genders').click()
    cy.contains('Edit fields details').click()
    cy.contains('the-new-name')
  })
})

cypress/e2e/uploadCsv.cy.js
ADDED
@@ -0,0 +1,59 @@
/// <reference types="cypress" />

import 'cypress-file-upload';
import 'cypress-iframe';


describe('Editor loads a local CSV as a resource', () => {
  it('should display the form: Overview, Metadata, Resources, & Record Sets', () => {
    // Streamlit starts on :8501.
    cy.visit('http://localhost:8501')
    cy.get('button', {timeout: 10000}).contains('Create', {timeout: 10000}).click()

    cy.get('input[aria-label="Name:red[*]"]').type('MyDataset').blur()
    cy.get('[data-testid="stMarkdownContainer"]')
      .contains('Metadata')
      .click()
    cy.get('input[aria-label="URL:red[*]"]').type('https://mydataset.com', {force: true})

    cy.get('[data-testid="stMarkdownContainer"]').contains('Resources').click()
    // Drag and drop mimicking: streamlit/e2e/specs/st_file_uploader.spec.js.
    cy.fixture('base.csv').then((fileContent) => {
      const file = {
        fileContent,
        fileName: 'base.csv', mimeType: 'text/csv',
      }
      cy.get(
        "[data-testid='stFileUploadDropzone']",
      ).attachFile(file, {
        force: true,
        subjectType: "drag-n-drop",
        events: ["dragenter", "drop"],
      })
    })
    cy.get('.uploadedFileData').contains('base.csv')
    cy.get('button').contains('Upload').click()
    // The file is uploaded, so we can click on it to see the details.
    // Wait a few seconds for the resource to download.
    cy.wait(2000)
    cy.enter('[title="components.tree.tree_component"]').then(getBody => {
      getBody().find('li').should('be.visible').click()
    })
    // For example, we see the first rows:
    cy.contains('First rows of data:')

    // On the record set page, we see the record set.
    cy.get('[data-testid="stMarkdownContainer"]').contains('RecordSets').click()
    cy.contains('base.csv_record_set (2 fields)').click()
    // We also see the fields with the proper types.
    cy.get('[data-testid="stDataFrameResizable"]').contains("column1")
    cy.get('[data-testid="stDataFrameResizable"]').contains("https://schema.org/Text")
    cy.get('[data-testid="stDataFrameResizable"]').contains("column2")
    cy.get('[data-testid="stDataFrameResizable"]').contains("https://schema.org/Integer")

    // I can edit the details of the fields.
    cy.contains('Edit fields details').click()
    cy.get('input[aria-label="Description"]').last().type('This is a nice custom description!{enter}')
    cy.get('[data-testid="glide-cell-2-1"]').contains("This is a nice custom description!")
  })
})

cypress/fixtures/base.csv
ADDED
@@ -0,0 +1,4 @@
column1,column2
A,1
B,2
C,3

cypress/fixtures/coco.json
ADDED
@@ -0,0 +1,409 @@
{
  "@context": {
    "@language": "en",
    "@vocab": "https://schema.org/",
    "column": "ml:column",
    "data": {
      "@id": "ml:data",
      "@type": "@json"
    },
    "dataType": {
      "@id": "ml:dataType",
      "@type": "@vocab"
    },
    "extract": "ml:extract",
    "field": "ml:field",
    "fileProperty": "ml:fileProperty",
    "format": "ml:format",
    "includes": "ml:includes",
    "isEnumeration": "ml:isEnumeration",
    "jsonPath": "ml:jsonPath",
    "ml": "http://mlcommons.org/schema/",
    "parentField": "ml:parentField",
    "path": "ml:path",
    "recordSet": "ml:recordSet",
    "references": "ml:references",
    "regex": "ml:regex",
    "repeated": "ml:repeated",
    "replace": "ml:replace",
    "sc": "https://schema.org/",
    "separator": "ml:separator",
    "source": "ml:source",
    "subField": "ml:subField",
    "transform": "ml:transform",
    "wd": "https://www.wikidata.org/wiki/"
  },
  "@type": "sc:Dataset",
  "name": "COCO",
  "description": "COCO is a large-scale object detection, segmentation, and captioning dataset. WARNING: `metadata.json` is incomplete and does not fully define the COCO2014 dataset. It lacks `recordSet` definitions that would enable automatic loading of all the annotations.",
  "citation": "@article{DBLP:journals/corr/LinMBHPRDZ14,\n author = {Tsung{-}Yi Lin and\n Michael Maire and\n Serge J. Belongie and\n Lubomir D. Bourdev and\n Ross B. Girshick and\n James Hays and\n Pietro Perona and\n Deva Ramanan and\n Piotr Doll{'{a}}r and\n C. Lawrence Zitnick},\n title = {Microsoft {COCO:} Common Objects in Context},\n journal = {CoRR},\n volume = {abs/1405.0312},\n year = {2014},\n url = {http://arxiv.org/abs/1405.0312},\n archivePrefix = {arXiv},\n eprint = {1405.0312},\n timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}",
  "license": [
    "Creative Commons Attribution 4.0 License",
    "https://www.flickr.com/creativecommons/"
  ],
  "url": "https://cocodataset.org/",
  "distribution": [
    {
      "@type": "sc:FileObject",
      "name": "train2014.zip",
      "contentSize": "13510573713 B",
      "contentUrl": "http://images.cocodataset.org/zips/train2014.zip",
      "encodingFormat": "application/zip",
      "sha256": "sha256"
    },
    {
      "@type": "sc:FileObject",
      "name": "val2014.zip",
      "contentSize": "6645013297 B",
      "contentUrl": "http://images.cocodataset.org/zips/val2014.zip",
      "encodingFormat": "application/zip",
      "sha256": "sha256"
    },
    {
      "@type": "sc:FileObject",
      "name": "test2014.zip",
      "contentSize": "6660437059 B",
      "contentUrl": "http://images.cocodataset.org/zips/test2014.zip",
      "encodingFormat": "application/zip",
      "sha256": "sha256"
    },
    {
      "@type": "sc:FileSet",
      "name": "image-files",
      "containedIn": [
        "train2014.zip",
        "val2014.zip",
        "test2014.zip"
      ],
      "encodingFormat": "image/jpeg",
      "includes": "*.jpg"
    },
    {
      "@type": "sc:FileObject",
      "name": "annotations_trainval2014.zip",
      "contentSize": "252872794 B",
      "contentUrl": "http://images.cocodataset.org/annotations/annotations_trainval2014.zip",
      "encodingFormat": "application/zip",
      "sha256": "sha256"
    },
    {
      "@type": "sc:FileSet",
      "name": "caption_annotations-files",
      "containedIn": "annotations_trainval2014.zip",
      "encodingFormat": "application/json",
      "includes": "annotations/captions_(val|train)2014.json"
    },
    {
      "@type": "sc:FileSet",
      "name": "person_keypoints_annotations",
      "containedIn": "annotations_trainval2014.zip",
      "encodingFormat": "application/json",
      "includes": "annotations/person_keypoints_(val|train)2014.json"
    },
    {
      "@type": "sc:FileSet",
      "name": "instancesperson_keypoints_annotations",
      "containedIn": "annotations_trainval2014.zip",
      "encodingFormat": "application/json",
      "includes": "annotations/instances_(val|train)2014.json"
    },
    {
      "@type": "sc:FileObject",
      "name": "image_info_test2014.zip",
      "contentSize": "763464 B",
      "contentUrl": "http://images.cocodataset.org/annotations/image_info_test2014.zip",
      "encodingFormat": "application/zip",
      "sha256": "sha256"
    },
    {
      "@type": "sc:FileSet",
      "name": "imageinfo",
      "containedIn": "image_info_test2014.zip",
      "encodingFormat": "application/json",
      "includes": "annotations/image_info_test.json"
    }
  ],
  "recordSet": [
    {
      "@type": "ml:RecordSet",
      "name": "split_enums",
      "description": "Maps split names to semantic values.",
      "key": "name",
      "field": [
        {
          "@type": "ml:Field",
          "name": "name",
          "description": "One of: train, val, test.",
          "dataType": "sc:Text"
        },
        {
          "@type": "ml:Field",
          "name": "url",
          "description": "Corresponding mlcommons.org definition URL",
          "dataType": [
            "sc:URL",
            "wd:Q3985153"
          ]
        }
      ],
      "data": [
        {
          "name": "train",
          "url": "https://mlcommons.org/definitions/training_split"
        },
        {
          "name": "val",
          "url": "https://mlcommons.org/definitions/validation_split"
        },
        {
          "name": "test",
          "url": "https://mlcommons.org/definitions/test_split"
        }
      ]
    },
    {
      "@type": "ml:RecordSet",
      "name": "images",
      "key": "image_id",
      "field": [
        {
          "@type": "ml:Field",
          "name": "image_id",
          "description": "The filename of the image. eg: COCO_train2014_000000000003.jpg",
          "dataType": "sc:Text",
          "source": {
            "distribution": "image-files",
            "extract": {
              "fileProperty": "filename"
            },
            "transform": {
              "regex": "^COCO_[train|val|test]2014_(\\d+)\\.jpg$"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "image_filename",
          "description": "The filename of the image. eg: COCO_train2014_000000000003.jpg",
          "dataType": "sc:Text",
          "source": {
            "distribution": "image-files",
            "extract": {
              "fileProperty": "filename"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "image_content",
          "description": "The content of the image.",
          "dataType": "sc:ImageObject",
          "source": {
            "distribution": "image-files",
            "extract": {
              "fileProperty": "content"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "split",
          "dataType": [
            "sc:Text",
            "wd:Q3985153"
          ],
          "references": {
            "field": "split_enums/name"
          },
          "source": {
            "distribution": "image-files",
            "extract": {
              "fileProperty": "fullpath"
            },
            "transform": {
              "regex": "^(train|val|test)2014/.*\\.jpg$"
            }
          }
        }
      ]
    },
    {
      "@type": "ml:RecordSet",
      "name": "captions",
      "key": "id",
      "field": [
        {
          "@type": "ml:Field",
          "name": "id",
          "description": "The ID of the caption",
          "dataType": "sc:Integer",
          "source": {
            "distribution": "caption_annotations-files",
            "extract": {
              "column": "id"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "image_id",
          "description": "The ID of the image",
          "dataType": "sc:Integer",
          "source": {
            "distribution": "caption_annotations-files",
            "extract": {
              "column": "image_id"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "caption",
          "description": "The caption",
          "dataType": [
            "sc:Text",
            "wd:Q18585177"
          ],
          "source": {
            "distribution": "caption_annotations-files",
            "extract": {
              "column": "caption"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "split",
          "dataType": [
            "sc:Text",
            "wd:Q3985153"
          ],
          "references": {
            "field": "split_enums/name"
          },
          "source": {
            "distribution": "caption_annotations-files",
            "extract": {
              "fileProperty": "filename"
            },
            "transform": {
              "regex": ".*_(val|train)2014\\.json$"
            }
          }
        }
      ]
    },
    {
      "@type": "ml:RecordSet",
      "name": "categories",
      "isEnumeration": true,
      "key": "id",
      "field": [
        {
          "@type": "ml:Field",
          "name": "id",
          "description": "The ID of the category",
          "dataType": "sc:Integer",
          "source": {
            "distribution": "instancesperson_keypoints_annotations",
            "extract": {
              "column": "id"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "name",
          "description": "The name of the category.",
          "dataType": [
            "sc:Text",
            "sc:name"
          ],
          "source": {
            "distribution": "instancesperson_keypoints_annotations",
            "extract": {
              "column": "name"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "supercategory",
          "description": "The name of the supercategory.",
          "dataType": [
            "sc:Text",
            "sc:name"
          ],
          "isEnumeration": true,
          "source": {
            "distribution": "instancesperson_keypoints_annotations",
            "extract": {
              "column": "supercategory"
            }
          }
        }
      ]
    },
    {
      "@type": "ml:RecordSet",
      "name": "annotations",
      "key": "id",
      "field": [
        {
          "@type": "ml:Field",
          "name": "id",
          "description": "The ID of the annotation.",
          "dataType": "sc:Integer",
          "source": {
            "distribution": "instancesperson_keypoints_annotations",
            "extract": {
              "column": "id"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "category_id",
          "description": "The ID of the category.",
          "dataType": "sc:Integer",
          "references": {
            "field": "categories/id"
          },
          "source": {
            "distribution": "instancesperson_keypoints_annotations",
            "extract": {
              "column": "category_id"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "image_id",
          "description": "The ID of the image.",
          "dataType": "sc:Integer",
          "references": {
            "field": "images/image_id"
          },
          "source": {
            "distribution": "instancesperson_keypoints_annotations",
            "extract": {
              "column": "image_id"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "bbox",
          "description": "The bounding box around annotated object[s].",
          "dataType": "ml:BoundingBox",
          "source": {
            "distribution": "instancesperson_keypoints_annotations",
            "extract": {
              "column": "bbox"
            }
          }
        }
      ]
    }
  ]
}

cypress/fixtures/titanic.json
ADDED
@@ -0,0 +1,343 @@
{
  "@context": {
    "@language": "en",
    "@vocab": "https://schema.org/",
    "column": "ml:column",
    "data": {
      "@id": "ml:data",
      "@type": "@json"
    },
    "dataType": {
      "@id": "ml:dataType",
      "@type": "@vocab"
    },
    "extract": "ml:extract",
    "field": "ml:field",
    "fileProperty": "ml:fileProperty",
    "format": "ml:format",
    "includes": "ml:includes",
    "isEnumeration": "ml:isEnumeration",
    "jsonPath": "ml:jsonPath",
    "ml": "http://mlcommons.org/schema/",
    "parentField": "ml:parentField",
    "path": "ml:path",
    "recordSet": "ml:recordSet",
    "references": "ml:references",
    "regex": "ml:regex",
    "repeated": "ml:repeated",
    "replace": "ml:replace",
    "sc": "https://schema.org/",
    "separator": "ml:separator",
    "source": "ml:source",
    "subField": "ml:subField",
    "transform": "ml:transform",
    "wd": "https://www.wikidata.org/wiki/"
  },
  "@type": "sc:Dataset",
  "name": "Titanic",
  "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n",
  "citation": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n",
  "license": "Public",
  "url": "https://www.openml.org/d/40945",
  "distribution": [
    {
      "@type": "sc:FileObject",
      "name": "passengers.csv",
      "contentSize": "117743 B",
      "contentUrl": "https://www.openml.org/data/get_csv/16826755/phpMYEkMl",
      "encodingFormat": "text/csv",
      "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"
    },
    {
      "@type": "sc:FileObject",
      "name": "genders.csv",
      "description": "Maps gender values (\"male\", \"female\") to semantic URLs.",
      "contentSize": "117743 B",
      "contentUrl": "data/genders.csv",
      "encodingFormat": "text/csv",
      "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"
    },
    {
      "@type": "sc:FileObject",
      "name": "embarkation_ports.csv",
      "description": "Maps Embarkation port initial to labeled values.",
      "contentSize": "117743 B",
      "contentUrl": "data/embarkation_ports.csv",
      "encodingFormat": "text/csv",
      "sha256": "c617db2c7470716250f6f001be51304c76bcc8815527ab8bae734bdca0735737"
    }
  ],
  "recordSet": [
    {
      "@type": "ml:RecordSet",
      "name": "genders",
      "description": "Maps gender labels to semantic definitions.",
      "isEnumeration": true,
      "key": "label",
      "field": [
        {
          "@type": "ml:Field",
          "name": "label",
          "description": "One of {\"male\", \"female\"}",
          "dataType": [
            "sc:Text",
            "sc:name"
          ],
          "source": {
            "distribution": "genders.csv",
            "extract": {
              "column": "label"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "url",
          "description": "Corresponding WikiData URL",
          "dataType": [
            "sc:URL",
            "wd:Q48277"
          ],
          "source": {
            "distribution": "genders.csv",
            "extract": {
              "column": "url"
            }
          }
        }
      ]
    },
    {
      "@type": "ml:RecordSet",
      "name": "embarkation_ports",
      "description": "Maps Embarkation port initial to labeled values.",
      "isEnumeration": true,
      "key": "key",
      "field": [
        {
          "@type": "ml:Field",
          "name": "key",
          "description": "C, Q, S or ?",
          "dataType": "sc:Text",
          "source": {
            "distribution": "embarkation_ports.csv",
            "extract": {
              "column": "key"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "label",
          "description": "Human-readable label",
          "dataType": [
            "sc:Text",
            "sc:name"
          ],
          "source": {
            "distribution": "embarkation_ports.csv",
            "extract": {
              "column": "label"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "url",
          "description": "Corresponding WikiData URL",
          "dataType": [
            "sc:URL",
            "wd:Q515"
          ],
          "source": {
            "distribution": "embarkation_ports.csv",
            "extract": {
              "column": "url"
            }
          }
        }
      ]
    },
    {
      "@type": "ml:RecordSet",
      "name": "passengers",
      "description": "The list of passengers. Does not include crew members.",
      "field": [
        {
          "@type": "ml:Field",
          "name": "name",
          "description": "Name of the passenger",
          "dataType": "sc:Text",
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "name"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "gender",
          "description": "Gender of passenger (male or female)",
          "dataType": "sc:Text",
          "references": {
            "field": "genders/label"
          },
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "sex"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "age",
          "description": "Age of passenger at time of death. It's a string, because some values can be `?`.",
          "dataType": "sc:Text",
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "age"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "survived",
          "description": "Survival status of passenger (0: Lost, 1: Saved)",
          "dataType": "sc:Integer",
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "survived"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "pclass",
          "description": "Passenger Class (1st/2nd/3rd)",
          "dataType": "sc:Integer",
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "pclass"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "cabin",
          "description": "Passenger cabin.",
          "dataType": "sc:Text",
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "cabin"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "embarked",
          "description": "Port of Embarkation (C: Cherbourg, Q: Queenstown, S: Southampton, ?: Unknown).",
          "dataType": "sc:Text",
          "references": {
            "field": "embarkation_ports/key"
          },
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "embarked"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "fare",
          "description": "Passenger Fare (British pound). It's a string, because some values can be `?`.",
          "dataType": "sc:Text",
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "fare"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "home_destination",
          "description": "Home and destination",
          "dataType": "sc:Text",
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "home.dest"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "ticket",
          "description": "Ticket Number, may include a letter.",
          "dataType": "sc:Text",
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "ticket"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "num_parents_children",
          "description": "Number of Parents/Children Aboard",
          "dataType": "sc:Integer",
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "parch"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "num_siblings_spouses",
          "description": "Number of Siblings/Spouses Aboard",
          "dataType": "sc:Integer",
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "sibsp"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "boat",
          "description": "Lifeboat used by passenger",
          "dataType": "sc:Text",
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "boat"
            }
          }
        },
        {
          "@type": "ml:Field",
          "name": "body",
          "description": "Body Identification Number",
          "dataType": "sc:Text",
          "source": {
            "distribution": "passengers.csv",
            "extract": {
              "column": "body"
            }
          }
        }
      ]
    }
  ]
}

cypress/screenshots/uploadCsv.cy.js/Editor loads a local CSV as a resource -- should display the form Overview, Metadata, Resources, & Record Sets (failed).png
ADDED

cypress/support/e2e.js
ADDED
@@ -0,0 +1,6 @@

import "./resize_observer"

beforeEach(() => {
  cy.ignore_resize_observer();
})

cypress/support/resize_observer.js
ADDED
@@ -0,0 +1,11 @@
Cypress.Commands.add("ignore_resize_observer", () => {
  const resizeObserverLoopErrRe = /ResizeObserver loop limit exceeded/

  // Consensus is that this exception is mostly harmless and intermittent when running tests.
  // https://stackoverflow.com/questions/63653605/resizeobserver-loop-limit-exceeded-api-is-never-used
  Cypress.on('uncaught:exception', err => {
    if (resizeObserverLoopErrRe.test(err.message)) {
      return false
    }
  })
})

events/__init__.py
ADDED
File without changes

events/fields.py
ADDED
@@ -0,0 +1,147 @@
import enum
from typing import Any

import streamlit as st

from core.state import Field
from core.state import Metadata
import mlcroissant as mlc


class ExtractType:
    """The type of extraction to perform."""

    COLUMN = "Column"
    JSON_PATH = "JSON path"
    FILE_CONTENT = "File content"
    FILE_NAME = "File name"
    FILE_PATH = "File path"
    FILE_FULLPATH = "Full path"
    FILE_LINES = "Lines in file"
    FILE_LINE_NUMBERS = "Line numbers in file"


class TransformType:
    """The type of transformation to perform."""

    FORMAT = "Apply format"
    JSON_PATH = "Apply JSON path"
    REGEX = "Apply regular expression"
    REPLACE = "Replace"
    SEPARATOR = "Separator"


def _get_source(source: mlc.Source | None, value: Any) -> mlc.Source:
    if not source:
        source = mlc.Source(extract=mlc.Extract())
    if value == ExtractType.COLUMN:
        source.extract = mlc.Extract(column="")
    elif value == ExtractType.FILE_CONTENT:
        source.extract = mlc.Extract(file_property=mlc.FileProperty.content)
    elif value == ExtractType.FILE_NAME:
        source.extract = mlc.Extract(file_property=mlc.FileProperty.filename)
    elif value == ExtractType.FILE_PATH:
        source.extract = mlc.Extract(file_property=mlc.FileProperty.filepath)
    elif value == ExtractType.FILE_FULLPATH:
        source.extract = mlc.Extract(file_property=mlc.FileProperty.fullpath)
    elif value == ExtractType.FILE_LINES:
        source.extract = mlc.Extract(file_property=mlc.FileProperty.lines)
    elif value == ExtractType.FILE_LINE_NUMBERS:
        source.extract = mlc.Extract(file_property=mlc.FileProperty.lineNumbers)
    elif value == ExtractType.JSON_PATH:
        source.extract = mlc.Extract(json_path="")
    return source


class FieldEvent(enum.Enum):
    """Event that triggers a field change."""

    NAME = "NAME"
    DESCRIPTION = "DESCRIPTION"
    DATA_TYPE = "DATA_TYPE"
    SOURCE = "SOURCE"
    SOURCE_EXTRACT = "SOURCE_EXTRACT"
    SOURCE_EXTRACT_COLUMN = "SOURCE_EXTRACT_COLUMN"
    SOURCE_EXTRACT_JSON_PATH = "SOURCE_EXTRACT_JSON_PATH"
    TRANSFORM = "TRANSFORM"
    TRANSFORM_FORMAT = "TRANSFORM_FORMAT"
    REFERENCE = "REFERENCE"
    REFERENCE_EXTRACT = "REFERENCE_EXTRACT"
    REFERENCE_EXTRACT_COLUMN = "REFERENCE_EXTRACT_COLUMN"
    REFERENCE_EXTRACT_JSON_PATH = "REFERENCE_EXTRACT_JSON_PATH"


def handle_field_change(
    change: FieldEvent,
    field: Field,
    key: str,
    **kwargs,
):
    value = st.session_state[key]
    if change == FieldEvent.NAME:
        old_name = field.name
        new_name = value
        if old_name != new_name:
            metadata: Metadata = st.session_state[Metadata]
            metadata.rename_field(old_name=old_name, new_name=new_name)
        field.name = value
    elif change == FieldEvent.DESCRIPTION:
        field.description = value
    elif change == FieldEvent.DATA_TYPE:
        field.data_types = [value]
    elif change == FieldEvent.SOURCE:
        node_type = "field" if "/" in value else "distribution"
        source = mlc.Source(uid=value, node_type=node_type)
        field.source = source
    elif change == FieldEvent.SOURCE_EXTRACT:
        source = field.source
        source = _get_source(source, value)
        field.source = source
    elif change == FieldEvent.SOURCE_EXTRACT_COLUMN:
        if not field.source:
            field.source = mlc.Source(extract=mlc.Extract())
        field.source.extract = mlc.Extract(column=value)
    elif change == FieldEvent.SOURCE_EXTRACT_JSON_PATH:
        if not field.source:
            field.source = mlc.Source(extract=mlc.Extract())
        field.source.extract = mlc.Extract(json_path=value)
    elif change == FieldEvent.TRANSFORM:
        number = kwargs.get("number")
        if number is not None and number < len(field.source.transforms):
            field.source.transforms[number] = mlc.Transform()
    elif change == TransformType.FORMAT:
        number = kwargs.get("number")
        if number is not None and number < len(field.source.transforms):
            field.source.transforms[number] = mlc.Transform(format=value)
    elif change == TransformType.JSON_PATH:
        number = kwargs.get("number")
        if number is not None and number < len(field.source.transforms):
            field.source.transforms[number] = mlc.Transform(json_path=value)
    elif change == TransformType.REGEX:
        number = kwargs.get("number")
        if number is not None and number < len(field.source.transforms):
            field.source.transforms[number] = mlc.Transform(regex=value)
    elif change == TransformType.REPLACE:
        number = kwargs.get("number")
        if number is not None and number < len(field.source.transforms):
            field.source.transforms[number] = mlc.Transform(replace=value)
    elif change == TransformType.SEPARATOR:
        number = kwargs.get("number")
        if number is not None and number < len(field.source.transforms):
            field.source.transforms[number] = mlc.Transform(separator=value)
    elif change == FieldEvent.REFERENCE:
        node_type = "field" if "/" in value else "distribution"
        source = mlc.Source(uid=value, node_type=node_type)
        field.references = source
    elif change == FieldEvent.REFERENCE_EXTRACT:
        source = field.references
        source = _get_source(source, value)
        field.references = source
    elif change == FieldEvent.REFERENCE_EXTRACT_COLUMN:
        if not field.references:
            field.references = mlc.Source(extract=mlc.Extract())
        field.references.extract = mlc.Extract(column=value)
    elif change == FieldEvent.REFERENCE_EXTRACT_JSON_PATH:
        if not field.references:
            field.references = mlc.Source(extract=mlc.Extract())
        field.references.extract = mlc.Extract(json_path=value)

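To illustrate the extract-switching logic above, a small sketch (not part of the commit) of how _get_source swaps the extraction strategy on an mlc.Source; it assumes mlcroissant's Extract/FileProperty API exactly as imported in events/fields.py:

from events.fields import ExtractType
from events.fields import _get_source
import mlcroissant as mlc

# Start from a column-based extraction, as a SOURCE_EXTRACT event would see it.
source = mlc.Source(extract=mlc.Extract(column="age"))

# Choosing "File name" in the UI rebuilds the extract around a file property.
source = _get_source(source, ExtractType.FILE_NAME)
assert source.extract.file_property == mlc.FileProperty.filename

# Choosing "Column" resets to an empty column, to be filled in later by a
# SOURCE_EXTRACT_COLUMN event.
source = _get_source(source, ExtractType.COLUMN)
assert source.extract.column == ""
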
events/metadata.py
ADDED
@@ -0,0 +1,28 @@
import enum

import streamlit as st

from core.state import Metadata


class MetadataEvent(enum.Enum):
    """Event that triggers a metadata change."""

    NAME = "NAME"
    DESCRIPTION = "DESCRIPTION"
    URL = "URL"
    LICENSE = "LICENSE"
    CITATION = "CITATION"


def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
    if event == MetadataEvent.NAME:
        metadata.name = st.session_state[key]
    elif event == MetadataEvent.DESCRIPTION:
        metadata.description = st.session_state[key]
    elif event == MetadataEvent.LICENSE:
        metadata.license = st.session_state[key]
    elif event == MetadataEvent.CITATION:
        metadata.citation = st.session_state[key]
    elif event == MetadataEvent.URL:
        metadata.url = st.session_state[key]

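These handlers are written as Streamlit on_change callbacks: the widget stores its value under st.session_state[key], and the handler copies it onto the Metadata object. A hypothetical wiring, sketched here because the actual widget keys live in the views modules (the key name below is invented for illustration):

import streamlit as st

from core.state import Metadata
from events.metadata import MetadataEvent
from events.metadata import handle_metadata_change

metadata: Metadata = st.session_state[Metadata]
key = "metadata-url"  # Invented key; the real views define their own key scheme.
st.text_input(
    "URL",
    value=metadata.url,
    key=key,
    on_change=handle_metadata_change,
    args=(MetadataEvent.URL, metadata, key),
)
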
events/record_sets.py
ADDED
@@ -0,0 +1,29 @@
import enum

import streamlit as st

from core.state import Metadata
from core.state import RecordSet


class RecordSetEvent(enum.Enum):
    """Event that triggers a RecordSet change."""

    NAME = "NAME"
    DESCRIPTION = "DESCRIPTION"
    IS_ENUMERATION = "IS_ENUMERATION"


def handle_record_set_change(event: RecordSetEvent, record_set: RecordSet, key: str):
    value = st.session_state[key]
    if event == RecordSetEvent.NAME:
        old_name = record_set.name
        new_name = value
        if old_name != new_name:
            metadata: Metadata = st.session_state[Metadata]
            metadata.rename_record_set(old_name=old_name, new_name=new_name)
        record_set.name = value
    elif event == RecordSetEvent.DESCRIPTION:
        record_set.description = value
    elif event == RecordSetEvent.IS_ENUMERATION:
        record_set.is_enumeration = value

events/resources.py
ADDED
@@ -0,0 +1,41 @@
import enum

import streamlit as st

from core.state import FileObject
from core.state import FileSet
from core.state import Metadata

Resource = FileObject | FileSet


class ResourceEvent(enum.Enum):
    """Event that triggers a resource change."""

    NAME = "NAME"
    DESCRIPTION = "DESCRIPTION"
    ENCODING_FORMAT = "ENCODING_FORMAT"
    SHA256 = "SHA256"
    CONTENT_SIZE = "CONTENT_SIZE"
    CONTENT_URL = "CONTENT_URL"


def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
    value = st.session_state[key]
    if event == ResourceEvent.NAME:
        old_name = resource.name
        new_name = value
        if old_name != new_name:
            metadata: Metadata = st.session_state[Metadata]
            metadata.rename_distribution(old_name=old_name, new_name=new_name)
        resource.name = value
    elif event == ResourceEvent.DESCRIPTION:
        resource.description = value
    elif event == ResourceEvent.ENCODING_FORMAT:
        resource.encoding_format = value
    elif event == ResourceEvent.SHA256:
        resource.sha256 = value
    elif event == ResourceEvent.CONTENT_SIZE:
        resource.content_size = value
    elif event == ResourceEvent.CONTENT_URL:
        resource.content_url = value