add all functions

- app.py +162 -57
- date_functions.tsv +24 -0
- list_functions.tsv +41 -0
- numeric_functions.tsv +55 -0
- requirements.txt +1 -0
- time_functions.tsv +11 -0
- timestamp_functions.tsv +39 -0
app.py
CHANGED
@@ -3,22 +3,19 @@ from functools import partial, lru_cache
 import duckdb
 import gradio as gr
 import pandas as pd
+import pyarrow as pa
 import requests
 from huggingface_hub import HfApi
 
 READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
-
+EMPTY_TABLE = pa.Table.from_pylist([{str(i): "" for i in range(4)}] * 10)
+EMPTY_DF: pd.DataFrame = EMPTY_TABLE.to_pandas()
 MAX_NUM_COLUMNS = 20
+NUM_TRENDING_DATASETS = 10
+NUM_USER_DATASETS = 10
 css = """
-@media (prefers-color-scheme: dark) {
-.transparent-dropdown, .transparent-dropdown .container .wrap {
-background: var(--bg-dark);
-}
-}
-@media (prefers-color-scheme: light) {
-.transparent-dropdown, .transparent-dropdown .container .wrap {
-background: var(--bg);
-}
+.transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
+background: var(--body-background-fill);
 }
 input {
 -webkit-user-select: none;
@@ -32,9 +29,25 @@ input {
 thead {
 display: none;
 }
+.secondary-wrap:has(input[aria-expanded="true"]) {
+background: var(--table-odd-background-fill);
+}
+.secondary-wrap:has(input[aria-expanded="true"])::after {
+content: '↵';
+margin-right: var(--size-10);
+border-width: 1px;
+border-color: var(--block-border-color);
+border-radius: .23rem;
+background-color: #141c2e;
+padding-left: 2px;
+font-size: .75rem;
+color: var(--block-title-text-color);
+}
+var(--body-background-fill)
 """
 js = """
-function setDataFrameReadonly() {
+function load() {
+// Set DataFrame readonly
 MutationObserver = window.MutationObserver || window.WebKitMutationObserver;
 var observer = new MutationObserver(function(mutations, observer) {
 // fired when a mutation occurs
@@ -46,38 +59,82 @@ function setDataFrameReadonly() {
 subtree: true,
 childList: true
 });
-
+
+// Run query on Enter in transform dropdown
+document.querySelectorAll("input").forEach(i => {
+if (i.parentElement.parentElement.parentElement.parentElement.parentElement.classList.contains("transform_dropdown")) {
+i.onkeydown = (event) => {
+if (event.code == "Enter") {
+document.getElementById("run_button").click();
+}
+}
+}
+})
 }
 """
 text_functions_df = pd.read_csv("text_functions.tsv", delimiter="\t")
+date_functions_df = pd.read_csv("date_functions.tsv", delimiter="\t")
+list_functions_df = pd.read_csv("list_functions.tsv", delimiter="\t")
+numeric_functions_df = pd.read_csv("numeric_functions.tsv", delimiter="\t")
+time_functions_df = pd.read_csv("time_functions.tsv", delimiter="\t")
+timestamp_functions_df = pd.read_csv("timestamp_functions.tsv", delimiter="\t")
 
 @lru_cache(maxsize=3)
 def duckdb_sql(query: str) -> duckdb.DuckDBPyRelation:
     return duckdb.sql(query)
 
-def prepare_function(func: str,
-
-
-
-
+def prepare_function(func: str, placeholders: list[str], column_name: str) -> str:
+    prepared_func = func.split("(", 1)
+    for placeholder in placeholders:
+        if placeholder in prepared_func[-1]:
+            prepared_func[-1] = prepared_func[-1].replace(placeholder, column_name, 1)
+            return "(".join(prepared_func)
     else:
-
+        return None
+
+def prettify_df(df: pd.DataFrame):
+    return df.apply(lambda s: s.apply(str))
+
+def get_prepared_functions_from_table(table: pa.Table) -> dict[str, list[str]]:
+    prepared_functions = {}
+    for field in table.schema:
+        if pa.types.is_integer(field.type) or pa.types.is_floating(field.type):
+            prepared_functions[field.name] = [prepare_function(numeric_func, ["x"], field.name) for numeric_func in numeric_functions_df.Name]
+        elif pa.types.is_string(field.type):
+            prepared_functions[field.name] = [prepare_function(text_func, ["string"], field.name) for text_func in text_functions_df.Name]
+        elif pa.types.is_date(field.type):
+            prepared_functions[field.name] = [prepare_function(date_func, ["startdate", "date"], field.name) for date_func in date_functions_df.Name]
+        elif pa.types.is_list(field.type):
+            prepared_functions[field.name] = [prepare_function(list_func, ["list"], field.name) for list_func in list_functions_df.Name]
+        elif pa.types.is_time(field.type):
+            prepared_functions[field.name] = [prepare_function(time_func, ["starttime", "time"], field.name) for time_func in time_functions_df.Name]
+        elif pa.types.is_timestamp(field.type):
+            prepared_functions[field.name] = [prepare_function(timestamp_func, ["startdate", "timestamp"], field.name) for timestamp_func in timestamp_functions_df.Name]
+        elif pa.types.is_struct(field.type):
+            prepared_functions[field.name] = [f"{field.name}.{subfield.name}" for subfield in field.type.fields]
+        else:
+            prepared_functions[field.name] = []
+        prepared_functions[field.name] = [prepared_function for prepared_function in prepared_functions[field.name] if prepared_function]
+    return prepared_functions
 
 with gr.Blocks(css=css, js=js) as demo:
     loading_codes_json = gr.JSON(visible=False)
     dataset_subset_split_textbox = gr.Textbox(visible=False)
-
+    input_table_state = gr.State()
+    run_button = gr.Button(visible=False, elem_id="run_button")
+    gr.Markdown("# Dataset Spreadsheets\n\nEdit any dataset on Hugging Face (full list [here](https://huggingface.co/datasets)) using DuckDB functions (documentation [here](https://duckdb.org/docs/sql/functions/overview))")
     with gr.Group():
         with gr.Row():
-            dataset_dropdown = gr.Dropdown(label="
+            dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
             subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
             split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False, elem_classes="transparent-dropdown")
             gr.LoginButton()
        with gr.Row():
-            transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in EMPTY_DF.columns]
-            transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
+            transform_dropdowns = [gr.Dropdown(choices=[column_name] + [prepare_function(text_func, "string", column_name) for text_func in text_functions_df.Name if "string" in text_func], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True, elem_classes="transform_dropdown") for column_name in EMPTY_DF.columns]
+            transform_dropdowns += [gr.Dropdown(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False, elem_classes="transform_dropdown") for _ in range(MAX_NUM_COLUMNS - len(transform_dropdowns))]
         dataframe = gr.DataFrame(EMPTY_DF, column_widths=[f"{1/len(EMPTY_DF.columns):.0%}"] * len(EMPTY_DF.columns), interactive=True, elem_classes="readonly-dataframe")
+    with gr.Accordion("Show SQL command", open=False, elem_classes="transparent-accordion"):
+        code_markdown = gr.Markdown()
 
     def show_subset_dropdown(dataset: str):
         if dataset and "/" not in dataset.strip().strip("/"):
@@ -93,79 +150,127 @@ with gr.Blocks(css=css, js=js) as demo:
         split = (splits or [""])[0]
         return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))
 
-    def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict])
+    def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]):
         pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
         if dataset and subset and split and pattern:
-
-            input_df = df
+            table = duckdb_sql(f"SELECT * FROM 'hf://datasets/{dataset}/{pattern}' LIMIT 10").arrow()
         else:
-
-
+            table = EMPTY_TABLE
+        prepared_functions = get_prepared_functions_from_table(table)
+        new_transform_dropdowns = [dict(choices=[column_name] + prepared_functions[column_name], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for column_name in table.column_names]
         new_transform_dropdowns += [dict(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))]
-
+        df = table.to_pandas()
+        return [table, dict(value=prettify_df(df), column_widths=[f"{1/len(df.columns):.0%}"] * len(df.columns))] + new_transform_dropdowns
 
-    def set_dataframe(
+    def set_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], input_table: pa.Table, df: pd.DataFrame, *transforms, show_warning=True):
         try:
-
+            table = duckdb.sql(f"SELECT {', '.join(transform for transform in transforms if transform)} FROM input_table;").arrow()
         except Exception as e:
-
-
-
+            if show_warning:
+                gr.Warning(f"{type(e).__name__}: {e}")
+            return {
+                dataframe: df
+            }
+        prepared_functions = get_prepared_functions_from_table(table)
+        new_transform_dropdowns = [dict(choices=list({original_column_name: None, column_name: None}) + prepared_functions[column_name], value=column_name, container=False, interactive=True, allow_custom_value=True, visible=True) for original_column_name, column_name in zip(input_table.column_names, table.column_names)]
+        new_transform_dropdowns += [dict(choices=[None], value=None, container=False, interactive=True, allow_custom_value=True, visible=False) for _ in range(MAX_NUM_COLUMNS - len(new_transform_dropdowns))]
+        pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
+        return {
+            dataframe: prettify_df(table.to_pandas()),
+            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
+            code_markdown: (
+                "```sql\n"
+                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
+                + f"FROM 'hf://datasets/{dataset}/{pattern}';"
+                + "\n```"
+            ) if pattern else "",
+        }
+
     for column_index, transform_dropdown in enumerate(transform_dropdowns):
-        transform_dropdown.select(partial(set_dataframe,
+        transform_dropdown.select(partial(set_dataframe, show_warning=False), inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, input_table_state, dataframe] + transform_dropdowns, outputs=[dataframe, code_markdown] + transform_dropdowns)
 
-
+    run_button.click(set_dataframe, inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, input_table_state, dataframe] + transform_dropdowns, outputs=[dataframe, code_markdown] + transform_dropdowns)
+
+    @demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, input_table_state, dataframe, code_markdown] + transform_dropdowns)
     def _fetch_datasets(request: gr.Request, oauth_token: gr.OAuthToken | None):
         api = HfApi(token=oauth_token.token if oauth_token else None)
-        datasets = list(api.list_datasets(limit=
+        datasets = list(api.list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]))
         if oauth_token and (user := api.whoami().get("name")):
-            datasets += list(api.list_datasets(limit=
+            datasets += list(api.list_datasets(limit=NUM_USER_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"], author=user))
         dataset = request.query_params.get("dataset") or datasets[0].id
         subsets, loading_codes = show_subset_dropdown(dataset)
         splits = show_split_dropdown(subsets["value"], loading_codes)
-
+        input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
+        pattern = ([loading_code["arguments"]["splits"][splits["value"]] for loading_code in loading_codes if loading_code["config_name"] == subsets["value"]] or [None])[0]
         return {
             dataset_dropdown: gr.Dropdown(choices=[dataset.id for dataset in datasets], value=dataset),
             loading_codes_json: loading_codes,
             subset_dropdown: gr.Dropdown(**subsets),
            split_dropdown: gr.Dropdown(**splits),
-
-            dataframe: gr.DataFrame(**
-            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns]))
+            input_table_state: input_table,
+            dataframe: gr.DataFrame(**input_dataframe),
+            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
+            code_markdown: (
+                "```sql\n"
+                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
+                + f"FROM 'hf://datasets/{dataset}/{pattern}';"
+                + "\n```"
+            ) if pattern else "",
         }
 
-    @dataset_dropdown.select(inputs=dataset_dropdown, outputs=[loading_codes_json, subset_dropdown, split_dropdown,
+    @dataset_dropdown.select(inputs=dataset_dropdown, outputs=[loading_codes_json, subset_dropdown, split_dropdown, input_table_state, dataframe, code_markdown] + transform_dropdowns)
     def _show_subset_dropdown(dataset: str):
         subsets, loading_codes = show_subset_dropdown(dataset)
         splits = show_split_dropdown(subsets["value"], loading_codes)
-
+        input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes)
+        pattern = ([loading_code["arguments"]["splits"][splits["value"]] for loading_code in loading_codes if loading_code["config_name"] == subsets["value"]] or [None])[0]
        return {
             loading_codes_json: loading_codes,
             subset_dropdown: gr.Dropdown(**subsets),
             split_dropdown: gr.Dropdown(**splits),
-
-            dataframe: gr.DataFrame(**
-            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns]))
+            input_table_state: input_table,
+            dataframe: gr.DataFrame(**input_dataframe),
+            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
+            code_markdown: (
+                "```sql\n"
+                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
+                + f"FROM 'hf://datasets/{dataset}/{pattern}';"
+                + "\n```"
+            ) if pattern else "",
        }
 
-    @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown,
+    @subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown, input_table_state, dataframe, code_markdown] + transform_dropdowns)
     def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
         splits = show_split_dropdown(subset, loading_codes)
-
+        input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subset, splits["value"], loading_codes)
+        pattern = ([loading_code["arguments"]["splits"][splits["value"]] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
         return {
             split_dropdown: gr.Dropdown(**splits),
-
-            dataframe: gr.DataFrame(**
-            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns]))
+            input_table_state: input_table,
+            dataframe: gr.DataFrame(**input_dataframe),
+            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
+            code_markdown: (
+                "```sql\n"
+                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
+                + f"FROM 'hf://datasets/{dataset}/{pattern}';"
+                + "\n```"
+            ) if pattern else "",
        }
 
-    @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[
+    @split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[input_table_state, dataframe, code_markdown] + transform_dropdowns)
     def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
-
+        input_table, input_dataframe, *new_transform_dropdowns = show_input_dataframe(dataset, subset, split, loading_codes)
+        pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
         return {
-
-            dataframe: gr.DataFrame(**
-            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns]))
+            input_table_state: input_table,
+            dataframe: gr.DataFrame(**input_dataframe),
+            **dict(zip(transform_dropdowns, [gr.Dropdown(**new_transform_dropdown) for new_transform_dropdown in new_transform_dropdowns])),
+            code_markdown: (
+                "```sql\n"
+                + f"SELECT {', '.join(new_transform_dropdown['value'] for new_transform_dropdown in new_transform_dropdowns if new_transform_dropdown['value'])} "
+                + f"FROM 'hf://datasets/{dataset}/{pattern}';"
+                + "\n```"
+            ) if pattern else "",
        }
 
 
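For reference, the new prepare_function turns one catalog row into a ready-to-run dropdown choice: it splits a signature like "lower(string)" at the first "(" and substitutes the real column name for the first matching placeholder. A standalone sketch (the helper body is copied from the diff above; the example calls and column names are illustrative):

def prepare_function(func: str, placeholders: list[str], column_name: str) -> str:
    # Only the part after the first "(" is searched, so a placeholder that
    # happens to appear in the function name itself is left untouched.
    prepared_func = func.split("(", 1)
    for placeholder in placeholders:
        if placeholder in prepared_func[-1]:
            prepared_func[-1] = prepared_func[-1].replace(placeholder, column_name, 1)
            return "(".join(prepared_func)
    else:
        # No placeholder matched (e.g., pi()): the app filters these out.
        return None

print(prepare_function("lower(string)", ["string"], "title"))                          # lower(title)
print(prepare_function("date_part(part, date)", ["startdate", "date"], "created_at"))  # date_part(part, created_at)
print(prepare_function("pi()", ["x"], "price"))                                        # None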
date_functions.tsv
ADDED
@@ -0,0 +1,24 @@
+Name	Description
+current_date	Current date (at start of current transaction).
+date_add(date, interval)	Add the interval to the date.
+date_diff(part, startdate, enddate)	The number of partition boundaries between the dates.
+date_part(part, date)	Get the subfield (equivalent to extract).
+date_sub(part, startdate, enddate)	The number of complete partitions between the dates.
+date_trunc(part, date)	Truncate to specified precision.
+datediff(part, startdate, enddate)	The number of partition boundaries between the dates. Alias of date_diff.
+datepart(part, date)	Get the subfield (equivalent to extract). Alias of date_part.
+datesub(part, startdate, enddate)	The number of complete partitions between the dates. Alias of date_sub.
+datetrunc(part, date)	Truncate to specified precision. Alias of date_trunc.
+dayname(date)	The (English) name of the weekday.
+extract(part from date)	Get subfield from a date.
+greatest(date, date)	The later of two dates.
+isfinite(date)	Returns true if the date is finite, false otherwise.
+isinf(date)	Returns true if the date is infinite, false otherwise.
+last_day(date)	The last day of the corresponding month in the date.
+least(date, date)	The earlier of two dates.
+make_date(year, month, day)	The date for the given parts.
+monthname(date)	The (English) name of the month.
+strftime(date, format)	Converts a date to a string according to the format string.
+time_bucket(bucket_width, date[, offset])	Truncate date by the specified interval bucket_width. Buckets are offset by offset interval.
+time_bucket(bucket_width, date[, origin])	Truncate date by the specified interval bucket_width. Buckets are aligned relative to origin date. origin defaults to 2000-01-03 for buckets that don't include a month or year interval, and to 2000-01-01 for month and year buckets.
+today()	Current date (start of current transaction).
list_functions.tsv
ADDED
@@ -0,0 +1,41 @@
+Name	Description
+list[index]	Bracket notation serves as an alias for list_extract.
+list[begin:end]	Bracket notation with colon is an alias for list_slice.
+list[begin:end:step]	list_slice in bracket notation with an added step feature.
+array_pop_back(list)	Returns the list without the last element.
+array_pop_front(list)	Returns the list without the first element.
+flatten(list_of_lists)	Concatenate a list of lists into a single list. This only flattens one level of the list (see examples).
+len(list)	Return the length of the list.
+list_aggregate(list, name)	Executes the aggregate function name on the elements of list. See the List Aggregates section for more details.
+list_any_value(list)	Returns the first non-null value in the list.
+list_append(list, element)	Appends element to list.
+list_concat(list1, list2)	Concatenate two lists. NULL inputs are skipped. See also ||
+list_contains(list, element)	Returns true if the list contains the element.
+list_cosine_similarity(list1, list2)	Compute the cosine similarity between two lists.
+list_cosine_distance(list1, list2)	Compute the cosine distance between two lists. Equivalent to 1.0 - list_cosine_similarity.
+list_distance(list1, list2)	Calculates the Euclidean distance between two points with coordinates given in two input lists of equal length.
+list_distinct(list)	Removes all duplicates and NULL values from a list. Does not preserve the original order.
+list_dot_product(list1, list2)	Computes the dot product of two same-sized lists of numbers.
+list_negative_dot_product(list1, list2)	Computes the negative dot product of two same-sized lists of numbers. Equivalent to - list_dot_product.
+list_extract(list, index)	Extract the indexth (1-based) value from the list.
+list_filter(list, lambda)	Constructs a list from those elements of the input list for which the lambda function returns true. See the Lambda Functions page for more details.
+list_grade_up(list)	Works like sort, but the results are the indexes that correspond to the position in the original list instead of the actual values.
+list_has_all(list, sub-list)	Returns true if all elements of sub-list exist in list.
+list_has_any(list1, list2)	Returns true if any elements exist in both lists.
+list_intersect(list1, list2)	Returns a list of all the elements that exist in both l1 and l2, without duplicates.
+list_position(list, element)	Returns the index of the element if the list contains the element. If the element is not found, it returns NULL.
+list_prepend(element, list)	Prepends element to list.
+list_reduce(list, lambda)	Returns a single value that is the result of applying the lambda function to each element of the input list. See the Lambda Functions page for more details.
+list_resize(list, size[, value])	Resizes the list to contain size elements. Initializes new elements with value or NULL if value is not set.
+list_reverse_sort(list)	Sorts the elements of the list in reverse order. See the Sorting Lists section for more details about the NULL sorting order.
+list_reverse(list)	Reverses the list.
+list_select(value_list, index_list)	Returns a list based on the elements selected by the index_list.
+list_slice(list, begin, end, step)	list_slice with added step feature.
+list_slice(list, begin, end)	Extract a sublist using slice conventions. Negative values are accepted. See slicing.
+list_sort(list)	Sorts the elements of the list. See the Sorting Lists section for more details about the sorting order and the NULL sorting order.
+list_transform(list, lambda)	Returns a list that is the result of applying the lambda function to each element of the input list. See the Lambda Functions page for more details.
+list_unique(list)	Counts the unique elements of a list.
+list_value(any, ...)	Create a LIST containing the argument values.
+list_where(value_list, mask_list)	Returns a list with the BOOLEANs in mask_list applied as a mask to the value_list.
+list_zip(list_1, list_2, ...[, truncate])	Zips k LISTs to a new LIST whose length will be that of the longest list. Its elements are structs of k elements from each list list_1, …, list_k, missing elements are replaced with NULL. If truncate is set, all lists are truncated to the smallest list length.
+unnest(list)	Unnests a list by one level. Note that this is a special function that alters the cardinality of the result. See the unnest page for more details.
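These rows mirror the DuckDB list-function reference; a quick sanity check of a couple of entries against DuckDB itself (a sketch, run locally):

import duckdb

# list_unique counts distinct non-NULL elements; flatten removes one nesting level.
print(duckdb.sql("SELECT list_unique([1, 1, 2, NULL]) AS u, flatten([[1, 2], [3]]) AS f").fetchall())
# [(2, [1, 2, 3])]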
numeric_functions.tsv
ADDED
@@ -0,0 +1,55 @@
+Name	Description
+@(x)	Absolute value. Parentheses are optional if x is a column name.
+abs(x)	Absolute value.
+acos(x)	Computes the arccosine of x.
+add(x, y)	Alias for x + y.
+asin(x)	Computes the arcsine of x.
+atan(x)	Computes the arctangent of x.
+atan2(y, x)	Computes the arctangent (y, x).
+bit_count(x)	Returns the number of bits that are set.
+cbrt(x)	Returns the cube root of the number.
+ceil(x)	Rounds the number up.
+ceiling(x)	Rounds the number up. Alias of ceil.
+cos(x)	Computes the cosine of x.
+cot(x)	Computes the cotangent of x.
+degrees(x)	Converts radians to degrees.
+divide(x, y)	Alias for x // y.
+even(x)	Round to next even number by rounding away from zero.
+exp(x)	Computes e ** x.
+factorial(x)	See ! operator. Computes the product of the current integer and all integers below it.
+fdiv(x, y)	Performs integer division (x // y) but returns a DOUBLE value.
+floor(x)	Rounds the number down.
+fmod(x, y)	Calculates the modulo value. Always returns a DOUBLE value.
+gamma(x)	Interpolation of the factorial of x - 1. Fractional inputs are allowed.
+gcd(x, y)	Computes the greatest common divisor of x and y.
+greatest_common_divisor(x, y)	Computes the greatest common divisor of x and y.
+greatest(x1, x2, ...)	Selects the largest value.
+isfinite(x)	Returns true if the floating point value is finite, false otherwise.
+isinf(x)	Returns true if the floating point value is infinite, false otherwise.
+isnan(x)	Returns true if the floating point value is not a number, false otherwise.
+lcm(x, y)	Computes the least common multiple of x and y.
+least_common_multiple(x, y)	Computes the least common multiple of x and y.
+least(x1, x2, ...)	Selects the smallest value.
+lgamma(x)	Computes the log of the gamma function.
+ln(x)	Computes the natural logarithm of x.
+log(x)	Computes the base-10 logarithm of x.
+log10(x)	Alias of log. Computes the base-10 logarithm of x.
+log2(x)	Computes the base-2 log of x.
+multiply(x, y)	Alias for x * y.
+nextafter(x, y)	Return the next floating point value after x in the direction of y.
+pi()	Returns the value of pi.
+pow(x, y)	Computes x to the power of y.
+power(x, y)	Alias of pow. Computes x to the power of y.
+radians(x)	Converts degrees to radians.
+random()	Returns a random number x in the range 0.0 <= x < 1.0.
+round_even(v NUMERIC, s INTEGER)	Alias of roundbankers(v, s). Round to s decimal places using the rounding half to even rule. Values s < 0 are allowed.
+round(v NUMERIC, s INTEGER)	Round to s decimal places. Values s < 0 are allowed.
+setseed(x)	Sets the seed to be used for the random function.
+sign(x)	Returns the sign of x as -1, 0 or 1.
+signbit(x)	Returns whether the signbit is set or not.
+sin(x)	Computes the sine of x.
+sqrt(x)	Returns the square root of the number.
+subtract(x, y)	Alias for x - y.
+tan(x)	Computes the tangent of x.
+trunc(x)	Truncates the number.
+xor(x, y)	Bitwise XOR.
requirements.txt
CHANGED
@@ -1 +1,2 @@
+pyarrow
 duckdb
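pyarrow is what lets the app hand query results between DuckDB and Gradio as tables. The round-trip the new code relies on, as a minimal sketch (the dataset path is illustrative; any Parquet dataset on the Hub follows the same hf:// pattern):

import duckdb

# Preview ten rows straight from the Hub, as show_input_dataframe does
# (illustrative dataset path):
input_table = duckdb.sql(
    "SELECT * FROM 'hf://datasets/stanfordnlp/imdb/plain_text/train-00000-of-00001.parquet' LIMIT 10"
).arrow()  # a pyarrow.Table, kept as the input_table state

# Apply a transform expression, as set_dataframe does; DuckDB resolves
# "input_table" to the local pyarrow.Table via its replacement scans:
out = duckdb.sql("SELECT lower(text) FROM input_table;").arrow()
print(out.column_names)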
time_functions.tsv
ADDED
@@ -0,0 +1,11 @@
+Name	Description
+current_time	Current time (start of current transaction).
+date_diff(part, starttime, endtime)	The number of partition boundaries between the times.
+date_part(part, time)	Get subfield (equivalent to extract).
+date_sub(part, starttime, endtime)	The number of complete partitions between the times.
+datediff(part, starttime, endtime)	Alias of date_diff. The number of partition boundaries between the times.
+datepart(part, time)	Alias of date_part. Get subfield (equivalent to extract).
+datesub(part, starttime, endtime)	Alias of date_sub. The number of complete partitions between the times.
+extract(part FROM time)	Get subfield from a time.
+get_current_time()	Current time (start of current transaction).
+make_time(bigint, bigint, double)	The time for the given parts.
timestamp_functions.tsv
ADDED
@@ -0,0 +1,39 @@
+Name	Description
+age(timestamp, timestamp)	Subtract arguments, resulting in the time difference between the two timestamps.
+age(timestamp)	Subtract from current_date.
+century(timestamp)	Extracts the century of a timestamp.
+current_timestamp	Returns the current timestamp (at the start of the transaction).
+date_diff(part, startdate, enddate)	The number of partition boundaries between the timestamps.
+date_part([part, ...], timestamp)	Get the listed subfields as a struct. The list must be constant.
+date_part(part, timestamp)	Get subfield (equivalent to extract).
+date_sub(part, startdate, enddate)	The number of complete partitions between the timestamps.
+date_trunc(part, timestamp)	Truncate to specified precision.
+datediff(part, startdate, enddate)	Alias of date_diff. The number of partition boundaries between the timestamps.
+datepart([part, ...], timestamp)	Alias of date_part. Get the listed subfields as a struct. The list must be constant.
+datepart(part, timestamp)	Alias of date_part. Get subfield (equivalent to extract).
+datesub(part, startdate, enddate)	Alias of date_sub. The number of complete partitions between the timestamps.
+datetrunc(part, timestamp)	Alias of date_trunc. Truncate to specified precision.
+dayname(timestamp)	The (English) name of the weekday.
+epoch_ms(ms)	Converts ms since epoch to a timestamp.
+epoch_ms(timestamp)	Converts a timestamp to milliseconds since the epoch.
+epoch_ms(timestamp)	Return the total number of milliseconds since the epoch.
+epoch_ns(timestamp)	Return the total number of nanoseconds since the epoch.
+epoch_us(timestamp)	Return the total number of microseconds since the epoch.
+epoch(timestamp)	Converts a timestamp to seconds since the epoch.
+extract(field FROM timestamp)	Get subfield from a timestamp.
+greatest(timestamp, timestamp)	The later of two timestamps.
+isfinite(timestamp)	Returns true if the timestamp is finite, false otherwise.
+isinf(timestamp)	Returns true if the timestamp is infinite, false otherwise.
+last_day(timestamp)	The last day of the month.
+least(timestamp, timestamp)	The earlier of two timestamps.
+make_timestamp(bigint, bigint, bigint, bigint, bigint, double)	The timestamp for the given parts.
+make_timestamp(microseconds)	The timestamp for the given number of µs since the epoch.
+monthname(timestamp)	The (English) name of the month.
+strftime(timestamp, format)	Converts timestamp to string according to the format string.
+strptime(text, format-list)	Converts the string text to timestamp applying the format strings in the list until one succeeds. Throws an error on failure. To return NULL on failure, use try_strptime.
+strptime(text, format)	Converts the string text to timestamp according to the format string. Throws an error on failure. To return NULL on failure, use try_strptime.
+time_bucket(bucket_width, timestamp[, offset])	Truncate timestamp by the specified interval bucket_width. Buckets are offset by offset interval.
+time_bucket(bucket_width, timestamp[, origin])	Truncate timestamp by the specified interval bucket_width. Buckets are aligned relative to origin timestamp. origin defaults to 2000-01-03 00:00:00 for buckets that don't include a month or year interval, and to 2000-01-01 00:00:00 for month and year buckets.
+to_timestamp(double)	Converts seconds since the epoch to a timestamp with time zone.
+try_strptime(text, format-list)	Converts the string text to timestamp applying the format strings in the list until one succeeds. Returns NULL on failure.
+try_strptime(text, format)	Converts the string text to timestamp according to the format string. Returns NULL on failure.