lhoestq (HF staff) committed
Commit 4c86203
1 Parent(s): bf29377

add all functions

app.py CHANGED
@@ -3,22 +3,19 @@ from functools import partial, lru_cache
 import duckdb
 import gradio as gr
 import pandas as pd
+import pyarrow as pa
 import requests
 from huggingface_hub import HfApi
 
 READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
-EMPTY_DF = pd.DataFrame([{str(i): "" for i in range(4)}] * 10)
+EMPTY_TABLE = pa.Table.from_pylist([{str(i): "" for i in range(4)}] * 10)
+EMPTY_DF: pd.DataFrame = EMPTY_TABLE.to_pandas()
 MAX_NUM_COLUMNS = 20
+NUM_TRENDING_DATASETS = 10
+NUM_USER_DATASETS = 10
 css = """
-@media (prefers-color-scheme: dark) {
-    .transparent-dropdown, .transparent-dropdown .container .wrap {
-        background: var(--bg-dark);
-    }
-}
-@media (prefers-color-scheme: light) {
-    .transparent-dropdown, .transparent-dropdown .container .wrap {
-        background: var(--bg);
-    }
+.transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
+    background: var(--body-background-fill);
 }
 input {
     -webkit-user-select: none;
@@ -32,9 +29,25 @@ input {
 thead {
     display: none;
 }
+.secondary-wrap:has(input[aria-expanded="true"]) {
+    background: var(--table-odd-background-fill);
+}
+.secondary-wrap:has(input[aria-expanded="true"])::after {
+    content: '↵';
+    margin-right: var(--size-10);
+    border-width: 1px;
+    border-color: var(--block-border-color);
+    border-radius: .23rem;
+    background-color: #141c2e;
+    padding-left: 2px;
+    font-size: .75rem;
+    color: var(--block-title-text-color);
+}
+var(--body-background-fill)
 """
 js = """
-function setDataFrameReadonly() {
+function load() {
+    // Set DataFrame readonly
 MutationObserver = window.MutationObserver || window.WebKitMutationObserver;
 var observer = new MutationObserver(function(mutations, observer) {
     // fired when a mutation occurs
@@ -46,38 +59,82 @@ function setDataFrameReadonly() {
     subtree: true,
     childList: true
 });
-
+
+    // Run query on Enter in transform dropdown
+    document.querySelectorAll("input").forEach(i => {
+        if (i.parentElement.parentElement.parentElement.parentElement.parentElement.classList.contains("transform_dropdown")) {
+            i.onkeydown = (event) => {
+                if (event.code == "Enter") {
+                    document.getElementById("run_button").click();
+                }
+            }
+        }
+    })
 }
 """
 text_functions_df = pd.read_csv("text_functions.tsv", delimiter="\t")
+date_functions_df = pd.read_csv("date_functions.tsv", delimiter="\t")
+list_functions_df = pd.read_csv("list_functions.tsv", delimiter="\t")
+numeric_functions_df = pd.read_csv("numeric_functions.tsv", delimiter="\t")
+time_functions_df = pd.read_csv("time_functions.tsv", delimiter="\t")
+timestamp_functions_df = pd.read_csv("timestamp_functions.tsv", delimiter="\t")
 
 @lru_cache(maxsize=3)
 def duckdb_sql(query: str) -> duckdb.DuckDBPyRelation:
     return duckdb.sql(query)
 
-def prepare_function(func: str, placeholder: str, column_name: str) -> str:
-    if "(" in func:
-        prepared_func = func.split("(")
-        prepared_func[1] = prepared_func[1].replace(placeholder, column_name, 1)
-        prepared_func = "(".join(prepared_func)
+def prepare_function(func: str, placeholders: list[str], column_name: str) -> str:
+    prepared_func = func.split("(", 1)
+    for placeholder in placeholders:
+        if placeholder in prepared_func[-1]:
+            prepared_func[-1] = prepared_func[-1].replace(placeholder, column_name, 1)
+            return "(".join(prepared_func)
     else:
-        prepared_func = func.replace(placeholder, column_name, 1)
-    return prepared_func
+        return None
+
+def prettify_df(df: pd.DataFrame):
+    return df.apply(lambda s: s.apply(str))
+
+def get_prepared_functions_from_table(table: pa.Table) -> dict[str, list[str]]:
+    prepared_functions = {}
+    for field in table.schema:
+        if pa.types.is_integer(field.type) or pa.types.is_floating(field.type):
+            prepared_functions[field.name] = [prepare_function(numeric_func, ["x"], field.name) for numeric_func in numeric_functions_df.Name]
+        elif pa.types.is_string(field.type):
+            prepared_functions[field.name] = [prepare_function(text_func, ["string"], field.name) for text_func in text_functions_df.Name]
+        elif pa.types.is_date(field.type):
+            prepared_functions[field.name] = [prepare_function(date_func, ["startdate", "date"], field.name) for date_func in date_functions_df.Name]
+        elif pa.types.is_list(field.type):
+            prepared_functions[field.name] = [prepare_function(list_func, ["list"], field.name) for list_func in list_functions_df.Name]
+        elif pa.types.is_time(field.type):
+            prepared_functions[field.name] = [prepare_function(time_func, ["starttime", "time"], field.name) for time_func in time_functions_df.Name]
+        elif pa.types.is_timestamp(field.type):
+            prepared_functions[field.name] = [prepare_function(timestamp_func, ["startdate", "timestamp"], field.name) for timestamp_func in timestamp_functions_df.Name]
+        elif pa.types.is_struct(field.type):
+            prepared_functions[field.name] = [f"{field.name}.{subfield.name}" for subfield in field.type.fields]
+        else:
+            prepared_functions[field.name] = []
+        prepared_functions[field.name] = [prepared_function for prepared_function in prepared_functions[field.name] if prepared_function]
+    return prepared_functions
 
 with gr.Blocks(css=css, js=js) as demo:
     loading_codes_json = gr.JSON(visible=False)
     dataset_subset_split_textbox = gr.Textbox(visible=False)
-    input_dataframe = gr.DataFrame(visible=False)
+    input_table_state = gr.State()
+    run_button = gr.Button(visible=False, elem_id="run_button")
+    gr.Markdown("# Dataset Spreadsheets\n\nEdit any dataset on Hugging Face (full list [here](https://huggingface.co/datasets)) using DuckDB functions (documentation [here](https://duckdb.org/docs/sql/functions/overview))")
    with gr.Group():
        with gr.Row():
-            dataset_dropdown = gr.Dropdown(label="Open Dataset", allow_custom_value=True, scale=10)
+            dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
             subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
             split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
             gr.LoginButton()
        with gr.Row():
-            transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in EMPTY_DF.columns]
-            transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
+            transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True, elem_classes="transform_dropdown") for column_name in EMPTY_DF.columns]
+            transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False, elem_classes="transform_dropdown") for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
             dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
+    with gr.Accordion("Show SQL command", open=False, elem_classes="transparent-accordion"):
+        code_markdown = gr.Markdown()
 
     def show_subset_dropdown(dataset: str):
         if dataset and "/" not in dataset.strip().strip("/"):
@@ -93,79 +150,127 @@ with gr.Blocks(css=css, js=js) as demo:
         split = (splits or [""])[0]
         return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))
 
-    def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
+    def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
         pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
         if dataset and subset and split and pattern:
-            df = duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT 10").df()
-            input_df = df
+            table = duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT 10").arrow()
         else:
-            input_df = EMPTY_DF
-        new_transform_dropdowns = [dict(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in input_df.columns]
+            table = EMPTY_TABLE
+        prepared_functions = get_prepared_functions_from_table(table)
+        new_transform_dropdowns = [dict(choices=[column_name] + prepared_functions[column_name], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in table.column_names]
         new_transform_dropdowns += [dict(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))]
-        return [dict(value=df, column_widths=[f"{1/len(df.columns):.0%}"] * len(df.columns))] + new_transform_dropdowns
+        df = table.to_pandas()
+        return [table, dict(value=prettify_df(df), column_widths=[f"{1/len(df.columns):.0%}"] * len(df.columns))] + new_transform_dropdowns
 
-    def set_dataframe(input_df: pd.DataFrame, *transforms: tuple[str], column_index: int):
+    def set_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], input_table: pa.Table, df: pd.DataFrame, *transforms, show_warning=True):
         try:
-            return duckdb.sql(f"SELECT {', '.join(transform for transform in transforms if transform)} FROM input_df;").df()
+            table = duckdb.sql(f"SELECT {', '.join(transform for transform in transforms if transform)} FROM input_table;").arrow()
         except Exception as e:
-            gr.Error(f"{type(e).__name__}: {e}")
-            return input_df
-
+            if show_warning:
+                gr.Warning(f"{type(e).__name__}: {e}")
+            return {
+                dataframe: df
+            }
+        prepared_functions = get_prepared_functions_from_table(table)
+        new_transform_dropdowns = [dict(choices=list({original_column_name: None, column_name: None}) + prepared_functions[column_name], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for original_column_name, column_name in zip(input_table.column_names, table.column_names)]
+        new_transform_dropdowns += [dict(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))]
+        pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
+        return {
+            dataframe: prettify_df(table.to_pandas()),
+            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
+            code_markdown: (
+                "```sql\n"
+                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
+                + f"FROM 'hf://datasets/{dataset}/{pattern}';"
+                + "\n```"
+            ) if pattern else "",
+        }
+
     for column_index, transform_dropdown in enumerate(transform_dropdowns):
-        transform_dropdown.select(partial(set_dataframe, column_index=column_index), inputs=[input_dataframe] + transform_dropdowns, outputs=dataframe)
+        transform_dropdown.select(partial(set_dataframe, show_warning=False), inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, input_table_state, dataframe] + transform_dropdowns, outputs=[dataframe, code_markdown] + transform_dropdowns)
 
-    @demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, input_dataframe, dataframe] + transform_dropdowns)
+    run_button.click(set_dataframe, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, input_table_state, dataframe] + transform_dropdowns, outputs=[dataframe, code_markdown] + transform_dropdowns)
+
+    @demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, input_table_state, dataframe, code_markdown] + transform_dropdowns)
     def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
         api = HfApi(token=oauth_token.token if oauth_token else None)
-        datasets = list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"]))
+        datasets = list(api.list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
         if oauth_token and (user := api.whoami().get("name")):
-            datasets += list(api.list_datasets(limit=3, sort="trendingScore", direction=-1, filter=["format:parquet"], author=user))
+            datasets += list(api.list_datasets(limit=NUM_USER_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"], author=user))
         dataset = request.query_params.get("dataset") or datasets[0].id
         subsets, loading_codes = show_subset_dropdown(dataset)
         splits = show_split_dropdown(subsets["value"], loading_codes)
-        input_df, *new_transform_dropdowns = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
+        input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
+        pattern = ([loading_code["arguments"]["splits"][splits["value"]] for loading_code in loading_codes if loading_code["config_name"] == subsets["value"]] or [None])[0]
         return {
             dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset),
             loading_codes_json: loading_codes,
             subset_dropdown: gr.Dropdown(**subsets),
             split_dropdown: gr.Dropdown(**splits),
-            input_dataframe: gr.DataFrame(**input_df),
-            dataframe: gr.DataFrame(**input_df),
-            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns]))
+            input_table_state: input_table,
+            dataframe: gr.DataFrame(**input_dataframe),
+            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
+            code_markdown: (
+                "```sql\n"
+                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
+                + f"FROM 'hf://datasets/{dataset}/{pattern}';"
+                + "\n```"
+            ) if pattern else "",
         }
 
-    @dataset_dropdown.select(inputs=dataset_dropdown, outputs=[loading_codes_json, subset_dropdown, split_dropdown, input_dataframe, dataframe] + transform_dropdowns)
+    @dataset_dropdown.select(inputs=dataset_dropdown, outputs=[loading_codes_json, subset_dropdown, split_dropdown, input_table_state, dataframe, code_markdown] + transform_dropdowns)
     def _show_subset_dropdown(dataset: str):
         subsets, loading_codes = show_subset_dropdown(dataset)
         splits = show_split_dropdown(subsets["value"], loading_codes)
-        input_df, *new_transform_dropdowns = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
+        input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
+        pattern = ([loading_code["arguments"]["splits"][splits["value"]] for loading_code in loading_codes if loading_code["config_name"] == subsets["value"]] or [None])[0]
         return {
             loading_codes_json: loading_codes,
             subset_dropdown: gr.Dropdown(**subsets),
             split_dropdown: gr.Dropdown(**splits),
-            input_dataframe: gr.DataFrame(**input_df),
-            dataframe: gr.DataFrame(**input_df),
-            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns]))
+            input_table_state: input_table,
+            dataframe: gr.DataFrame(**input_dataframe),
+            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
+            code_markdown: (
+                "```sql\n"
+                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
+                + f"FROM 'hf://datasets/{dataset}/{pattern}';"
+                + "\n```"
+            ) if pattern else "",
        }
 
-    @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown, input_dataframe, dataframe] + transform_dropdowns)
+    @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown, input_table_state, dataframe, code_markdown] + transform_dropdowns)
     def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
         splits = show_split_dropdown(subset, loading_codes)
-        input_df, *new_transform_dropdowns = show_input_dataframe(dataset, subset, splits["value"], loading_codes)
+        input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subset, splits["value"], loading_codes)
+        pattern = ([loading_code["arguments"]["splits"][splits["value"]] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
         return {
             split_dropdown: gr.Dropdown(**splits),
-            input_dataframe: gr.DataFrame(**input_df),
-            dataframe: gr.DataFrame(**input_df),
-            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns]))
+            input_table_state: input_table,
+            dataframe: gr.DataFrame(**input_dataframe),
+            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
+            code_markdown: (
+                "```sql\n"
+                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
+                + f"FROM 'hf://datasets/{dataset}/{pattern}';"
+                + "\n```"
+            ) if pattern else "",
        }
 
-    @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[input_dataframe, dataframe] + transform_dropdowns)
+    @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[input_table_state, dataframe, code_markdown] + transform_dropdowns)
     def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
-        input_df, *new_transform_dropdowns = show_input_dataframe(dataset, subset, split, loading_codes)
+        input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subset, split, loading_codes)
+        pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
         return {
-            input_dataframe: gr.DataFrame(**input_df),
-            dataframe: gr.DataFrame(**input_df),
-            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns]))
+            input_table_state: input_table,
+            dataframe: gr.DataFrame(**input_dataframe),
+            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
+            code_markdown: (
+                "```sql\n"
+                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
+                + f"FROM 'hf://datasets/{dataset}/{pattern}';"
+                + "\n```"
+            ) if pattern else "",
        }
 
 
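Note on the mechanism: set_dataframe assembles a SELECT from the dropdown expressions and runs it with DuckDB directly over the pyarrow.Table held in input_table_state, using DuckDB's replacement scans to resolve the Python variable by name; the Arrow schema is presumably why the commit moves from .df() to .arrow(), since get_prepared_functions_from_table keys its suggestions off Arrow types. A minimal standalone sketch of that pattern (the sample table and transform strings are illustrative, not from the commit):

    import duckdb
    import pyarrow as pa

    # Stand-in for the Arrow table the app keeps in input_table_state.
    input_table = pa.Table.from_pylist([
        {"name": "Alice", "age": 30},
        {"name": "Bob", "age": 25},
    ])

    # Expressions like the ones prepare_function builds: the template
    # "upper(string)" with its "string" placeholder replaced by a column name.
    transforms = ["upper(name)", "age + 1"]

    # DuckDB resolves `input_table` from the enclosing Python scope
    # (replacement scan), like the SELECT that set_dataframe runs.
    result = duckdb.sql(f"SELECT {', '.join(transforms)} FROM input_table").arrow()
    print(result.to_pylist())  # values: 'ALICE'/'BOB' and 31/26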
 
date_functions.tsv ADDED
@@ -0,0 +1,24 @@
+Name	Description
+current_date	Current date (at start of current transaction).
+date_add(date, interval)	Add the interval to the date.
+date_diff(part, startdate, enddate)	The number of partition boundaries between the dates.
+date_part(part, date)	Get the subfield (equivalent to extract).
+date_sub(part, startdate, enddate)	The number of complete partitions between the dates.
+date_trunc(part, date)	Truncate to specified precision.
+datediff(part, startdate, enddate)	The number of partition boundaries between the dates. Alias of date_diff.
+datepart(part, date)	Get the subfield (equivalent to extract). Alias of date_part.
+datesub(part, startdate, enddate)	The number of complete partitions between the dates. Alias of date_sub.
+datetrunc(part, date)	Truncate to specified precision. Alias of date_trunc.
+dayname(date)	The (English) name of the weekday.
+extract(part from date)	Get subfield from a date.
+greatest(date, date)	The later of two dates.
+isfinite(date)	Returns true if the date is finite, false otherwise.
+isinf(date)	Returns true if the date is infinite, false otherwise.
+last_day(date)	The last day of the corresponding month in the date.
+least(date, date)	The earlier of two dates.
+make_date(year, month, day)	The date for the given parts.
+monthname(date)	The (English) name of the month.
+strftime(date, format)	Converts a date to a string according to the format string.
+time_bucket(bucket_width, date[, offset])	Truncate date by the specified interval bucket_width. Buckets are offset by offset interval.
+time_bucket(bucket_width, date[, origin])	Truncate date by the specified interval bucket_width. Buckets are aligned relative to origin date. origin defaults to 2000-01-03 for buckets that don't include a month or year interval, and to 2000-01-01 for month and year buckets.
+today()	Current date (start of current transaction).
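The date helpers above are plain DuckDB SQL functions, so they can be tried directly from Python; a quick sketch (dates are arbitrary examples):

    import duckdb

    # Month boundaries crossed between the two dates -> 2
    print(duckdb.sql("SELECT date_diff('month', DATE '2024-01-15', DATE '2024-03-01')").fetchone()[0])
    # Last day of the month containing the date -> 2024-02-29
    print(duckdb.sql("SELECT last_day(DATE '2024-02-05')").fetchone()[0])
    # English weekday name -> Monday
    print(duckdb.sql("SELECT dayname(DATE '2024-01-01')").fetchone()[0])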
list_functions.tsv ADDED
@@ -0,0 +1,41 @@
+Name	Description
+list[index]	Bracket notation serves as an alias for list_extract.
+list[begin:end]	Bracket notation with colon is an alias for list_slice.
+list[begin:end:step]	list_slice in bracket notation with an added step feature.
+array_pop_back(list)	Returns the list without the last element.
+array_pop_front(list)	Returns the list without the first element.
+flatten(list_of_lists)	Concatenate a list of lists into a single list. This only flattens one level of the list (see examples).
+len(list)	Return the length of the list.
+list_aggregate(list, name)	Executes the aggregate function name on the elements of list. See the List Aggregates section for more details.
+list_any_value(list)	Returns the first non-null value in the list.
+list_append(list, element)	Appends element to list.
+list_concat(list1, list2)	Concatenate two lists. NULL inputs are skipped. See also ||.
+list_contains(list, element)	Returns true if the list contains the element.
+list_cosine_similarity(list1, list2)	Compute the cosine similarity between two lists.
+list_cosine_distance(list1, list2)	Compute the cosine distance between two lists. Equivalent to 1.0 - list_cosine_similarity.
+list_distance(list1, list2)	Calculates the Euclidean distance between two points with coordinates given in two input lists of equal length.
+list_distinct(list)	Removes all duplicates and NULL values from a list. Does not preserve the original order.
+list_dot_product(list1, list2)	Computes the dot product of two same-sized lists of numbers.
+list_negative_dot_product(list1, list2)	Computes the negative dot product of two same-sized lists of numbers. Equivalent to - list_dot_product.
+list_extract(list, index)	Extract the indexth (1-based) value from the list.
+list_filter(list, lambda)	Constructs a list from those elements of the input list for which the lambda function returns true. See the Lambda Functions page for more details.
+list_grade_up(list)	Works like sort, but the results are the indexes that correspond to the position in the original list instead of the actual values.
+list_has_all(list, sub-list)	Returns true if all elements of sub-list exist in list.
+list_has_any(list1, list2)	Returns true if any elements exist in both lists.
+list_intersect(list1, list2)	Returns a list of all the elements that exist in both list1 and list2, without duplicates.
+list_position(list, element)	Returns the index of the element if the list contains the element. If the element is not found, it returns NULL.
+list_prepend(element, list)	Prepends element to list.
+list_reduce(list, lambda)	Returns a single value that is the result of applying the lambda function to each element of the input list. See the Lambda Functions page for more details.
+list_resize(list, size[, value])	Resizes the list to contain size elements. Initializes new elements with value or NULL if value is not set.
+list_reverse_sort(list)	Sorts the elements of the list in reverse order. See the Sorting Lists section for more details about the NULL sorting order.
+list_reverse(list)	Reverses the list.
+list_select(value_list, index_list)	Returns a list based on the elements selected by the index_list.
+list_slice(list, begin, end, step)	list_slice with added step feature.
+list_slice(list, begin, end)	Extract a sublist using slice conventions. Negative values are accepted. See slicing.
+list_sort(list)	Sorts the elements of the list. See the Sorting Lists section for more details about the sorting order and the NULL sorting order.
+list_transform(list, lambda)	Returns a list that is the result of applying the lambda function to each element of the input list. See the Lambda Functions page for more details.
+list_unique(list)	Counts the unique elements of a list.
+list_value(any, ...)	Create a LIST containing the argument values.
+list_where(value_list, mask_list)	Returns a list with the BOOLEANs in mask_list applied as a mask to the value_list.
+list_zip(list_1, list_2, ...[, truncate])	Zips k LISTs to a new LIST whose length will be that of the longest list. Its elements are structs of k elements from each list list_1, …, list_k, missing elements are replaced with NULL. If truncate is set, all lists are truncated to the smallest list length.
+unnest(list)	Unnests a list by one level. Note that this is a special function that alters the cardinality of the result. See the unnest page for more details.
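As with the date helpers, these can be exercised directly in DuckDB; a small sketch (values arbitrary):

    import duckdb

    print(duckdb.sql("SELECT list_transform([1, 2, 3], x -> x * 2)").fetchone()[0])  # [2, 4, 6]
    print(duckdb.sql("SELECT flatten([[1, 2], [3]])").fetchone()[0])                 # [1, 2, 3]
    print(duckdb.sql("SELECT list_contains([1, 2, 3], 2)").fetchone()[0])            # True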
numeric_functions.tsv ADDED
@@ -0,0 +1,55 @@
+Name	Description
+@(x)	Absolute value. Parentheses are optional if x is a column name.
+abs(x)	Absolute value.
+acos(x)	Computes the arccosine of x.
+add(x, y)	Alias for x + y.
+asin(x)	Computes the arcsine of x.
+atan(x)	Computes the arctangent of x.
+atan2(y, x)	Computes the arctangent (y, x).
+bit_count(x)	Returns the number of bits that are set.
+cbrt(x)	Returns the cube root of the number.
+ceil(x)	Rounds the number up.
+ceiling(x)	Rounds the number up. Alias of ceil.
+cos(x)	Computes the cosine of x.
+cot(x)	Computes the cotangent of x.
+degrees(x)	Converts radians to degrees.
+divide(x, y)	Alias for x // y.
+even(x)	Round to next even number by rounding away from zero.
+exp(x)	Computes e ** x.
+factorial(x)	See ! operator. Computes the product of the current integer and all integers below it.
+fdiv(x, y)	Performs integer division (x // y) but returns a DOUBLE value.
+floor(x)	Rounds the number down.
+fmod(x, y)	Calculates the modulo value. Always returns a DOUBLE value.
+gamma(x)	Interpolation of the factorial of x - 1. Fractional inputs are allowed.
+gcd(x, y)	Computes the greatest common divisor of x and y.
+greatest_common_divisor(x, y)	Computes the greatest common divisor of x and y.
+greatest(x1, x2, ...)	Selects the largest value.
+isfinite(x)	Returns true if the floating point value is finite, false otherwise.
+isinf(x)	Returns true if the floating point value is infinite, false otherwise.
+isnan(x)	Returns true if the floating point value is not a number, false otherwise.
+lcm(x, y)	Computes the least common multiple of x and y.
+least_common_multiple(x, y)	Computes the least common multiple of x and y.
+least(x1, x2, ...)	Selects the smallest value.
+lgamma(x)	Computes the log of the gamma function.
+ln(x)	Computes the natural logarithm of x.
+log(x)	Computes the base-10 logarithm of x.
+log10(x)	Alias of log. Computes the base-10 logarithm of x.
+log2(x)	Computes the base-2 log of x.
+multiply(x, y)	Alias for x * y.
+nextafter(x, y)	Return the next floating point value after x in the direction of y.
+pi()	Returns the value of pi.
+pow(x, y)	Computes x to the power of y.
+power(x, y)	Alias of pow. Computes x to the power of y.
+radians(x)	Converts degrees to radians.
+random()	Returns a random number x in the range 0.0 <= x < 1.0.
+round_even(v NUMERIC, s INTEGER)	Alias of roundbankers(v, s). Round to s decimal places using the rounding half to even rule. Values s < 0 are allowed.
+round(v NUMERIC, s INTEGER)	Round to s decimal places. Values s < 0 are allowed.
+setseed(x)	Sets the seed to be used for the random function.
+sign(x)	Returns the sign of x as -1, 0 or 1.
+signbit(x)	Returns whether the signbit is set or not.
+sin(x)	Computes the sine of x.
+sqrt(x)	Returns the square root of the number.
+subtract(x, y)	Alias for x - y.
+tan(x)	Computes the tangent of x.
+trunc(x)	Truncates the number.
+xor(x, y)	Bitwise XOR.
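A one-line sanity check of a few of the numeric helpers (values arbitrary):

    import duckdb

    print(duckdb.sql("SELECT round(3.14159, 2), gcd(12, 18), greatest(1, 5, 3)").fetchone())  # (3.14, 6, 5)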
requirements.txt CHANGED
@@ -1 +1,2 @@
+pyarrow
 duckdb
time_functions.tsv ADDED
@@ -0,0 +1,11 @@
+Name	Description
+current_time	Current time (start of current transaction).
+date_diff(part, starttime, endtime)	The number of partition boundaries between the times.
+date_part(part, time)	Get subfield (equivalent to extract).
+date_sub(part, starttime, endtime)	The number of complete partitions between the times.
+datediff(part, starttime, endtime)	Alias of date_diff. The number of partition boundaries between the times.
+datepart(part, time)	Alias of date_part. Get subfield (equivalent to extract).
+datesub(part, starttime, endtime)	Alias of date_sub. The number of complete partitions between the times.
+extract(part FROM time)	Get subfield from a time.
+get_current_time()	Current time (start of current transaction).
+make_time(bigint, bigint, double)	The time for the given parts.
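A short check of the time helpers (values arbitrary):

    import duckdb

    # Build a TIME value from hour, minute, seconds -> 14:30:05.500000
    print(duckdb.sql("SELECT make_time(14, 30, 5.5)").fetchone()[0])
    # Extract a subfield -> 30
    print(duckdb.sql("SELECT date_part('minute', TIME '14:30:05')").fetchone()[0])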
timestamp_functions.tsv ADDED
@@ -0,0 +1,39 @@
+Name	Description
+age(timestamp, timestamp)	Subtract arguments, resulting in the time difference between the two timestamps.
+age(timestamp)	Subtract from current_date.
+century(timestamp)	Extracts the century of a timestamp.
+current_timestamp	Returns the current timestamp (at the start of the transaction).
+date_diff(part, startdate, enddate)	The number of partition boundaries between the timestamps.
+date_part([part, ...], timestamp)	Get the listed subfields as a struct. The list must be constant.
+date_part(part, timestamp)	Get subfield (equivalent to extract).
+date_sub(part, startdate, enddate)	The number of complete partitions between the timestamps.
+date_trunc(part, timestamp)	Truncate to specified precision.
+datediff(part, startdate, enddate)	Alias of date_diff. The number of partition boundaries between the timestamps.
+datepart([part, ...], timestamp)	Alias of date_part. Get the listed subfields as a struct. The list must be constant.
+datepart(part, timestamp)	Alias of date_part. Get subfield (equivalent to extract).
+datesub(part, startdate, enddate)	Alias of date_sub. The number of complete partitions between the timestamps.
+datetrunc(part, timestamp)	Alias of date_trunc. Truncate to specified precision.
+dayname(timestamp)	The (English) name of the weekday.
+epoch_ms(ms)	Converts ms since epoch to a timestamp.
+epoch_ms(timestamp)	Converts a timestamp to milliseconds since the epoch.
+epoch_ms(timestamp)	Return the total number of milliseconds since the epoch.
+epoch_ns(timestamp)	Return the total number of nanoseconds since the epoch.
+epoch_us(timestamp)	Return the total number of microseconds since the epoch.
+epoch(timestamp)	Converts a timestamp to seconds since the epoch.
+extract(field FROM timestamp)	Get subfield from a timestamp.
+greatest(timestamp, timestamp)	The later of two timestamps.
+isfinite(timestamp)	Returns true if the timestamp is finite, false otherwise.
+isinf(timestamp)	Returns true if the timestamp is infinite, false otherwise.
+last_day(timestamp)	The last day of the month.
+least(timestamp, timestamp)	The earlier of two timestamps.
+make_timestamp(bigint, bigint, bigint, bigint, bigint, double)	The timestamp for the given parts.
+make_timestamp(microseconds)	The timestamp for the given number of µs since the epoch.
+monthname(timestamp)	The (English) name of the month.
+strftime(timestamp, format)	Converts timestamp to string according to the format string.
+strptime(text, format-list)	Converts the string text to timestamp applying the format strings in the list until one succeeds. Throws an error on failure. To return NULL on failure, use try_strptime.
+strptime(text, format)	Converts the string text to timestamp according to the format string. Throws an error on failure. To return NULL on failure, use try_strptime.
+time_bucket(bucket_width, timestamp[, offset])	Truncate timestamp by the specified interval bucket_width. Buckets are offset by offset interval.
+time_bucket(bucket_width, timestamp[, origin])	Truncate timestamp by the specified interval bucket_width. Buckets are aligned relative to origin timestamp. origin defaults to 2000-01-03 00:00:00 for buckets that don't include a month or year interval, and to 2000-01-01 00:00:00 for month and year buckets.
+to_timestamp(double)	Converts seconds since the epoch to a timestamp with time zone.
+try_strptime(text, format-list)	Converts the string text to timestamp applying the format strings in the list until one succeeds. Returns NULL on failure.
+try_strptime(text, format)	Converts the string text to timestamp according to the format string. Returns NULL on failure.
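And a round-trip through the timestamp helpers (values arbitrary):

    import duckdb

    # Parse a string into a TIMESTAMP -> 2024-07-23 14:04:22
    print(duckdb.sql("SELECT strptime('2024-07-23 14:04:22', '%Y-%m-%d %H:%M:%S')").fetchone()[0])
    # Milliseconds since the epoch for the same timestamp
    print(duckdb.sql("SELECT epoch_ms(TIMESTAMP '2024-07-23 14:04:22')").fetchone()[0])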