"""Gradio demo: turn a question about a Hugging Face dataset into DuckDB SQL.

The schema is taken from the first parquet file that the datasets-server
lists for the dataset, and the SQL is generated by a local DuckDB-NSQL GGUF
model run through llama.cpp.
"""

import os

import duckdb
import gradio as gr
from dotenv import load_dotenv
from httpx import Client
from huggingface_hub import HfApi
from huggingface_hub.utils import logging
from llama_cpp import Llama

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
# Explicit check rather than `assert`: asserts are stripped under `python -O`,
# which would let the app start and send "Bearer None" as its credential.
if HF_TOKEN is None:
    raise RuntimeError("You need to set HF_TOKEN in your environment variables")

BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
API_URL = "https://m82etjwvhoptr3t5.us-east-1.aws.endpoints.huggingface.cloud"
headers = {
    "Accept": "application/json",
    "Authorization": f"Bearer {HF_TOKEN}",
    "Content-Type": "application/json",
}

logger = logging.get_logger(__name__)
# Shared HTTP client; every request it sends carries `headers` by default.
client = Client(headers=headers)
api = HfApi(token=HF_TOKEN)
# Local model, loaded once at import time from a relative path.
llama = Llama(
    model_path="DuckDB-NSQL-7B-v0.1-q8_0.gguf",
    n_ctx=2048,
)


def get_first_parquet(dataset: str) -> dict:
    """Return the first entry of ``parquet_files`` for *dataset*.

    Queries the ``/parquet`` endpoint of the datasets-server.

    Raises:
        httpx.HTTPStatusError: if the server answers with an error status.
        KeyError, IndexError: if the payload lists no parquet files.
    """
    # `params=` lets httpx URL-encode the dataset name instead of splicing it
    # raw into the query string.
    resp = client.get(
        f"{BASE_DATASETS_SERVER_URL}/parquet",
        params={"dataset": dataset},
    )
    resp.raise_for_status()
    return resp.json()["parquet_files"][0]


def query_remote_model(text):
    """Send *text* to the endpoint at ``API_URL`` and return the generated text.

    Raises:
        httpx.HTTPStatusError: if the endpoint answers with an error status.
    """
    payload = {
        "inputs": text,
        "parameters": {},
    }
    # The client already sends `headers` on every request.
    response = client.post(API_URL, json=payload)
    # Fail with the HTTP error itself rather than an opaque KeyError/TypeError
    # when the body is an error payload instead of a prediction list.
    response.raise_for_status()
    pred = response.json()
    return pred[0]["generated_text"]


def query_local_model(text):
    """Run *text* through the local llama.cpp model and return its completion."""
    pred = llama(text, temperature=0.1, max_tokens=500)
    return pred["choices"][0]["text"]


def _read_table_ddl(parquet_url: str) -> str:
    """Return the ``CREATE TABLE`` statement DuckDB derives from *parquet_url*."""
    # Double single quotes so the URL cannot terminate the SQL string literal.
    quoted_url = parquet_url.replace("'", "''")
    con = duckdb.connect()
    try:
        con.execute("INSTALL 'httpfs'; LOAD httpfs;")
        # One row is enough: only the inferred column definitions are needed.
        con.execute(f"CREATE TABLE data as SELECT * FROM '{quoted_url}' LIMIT 1;")
        result = con.sql(
            "SELECT sql FROM duckdb_tables() where table_name ='data';"
        ).df()
    finally:
        # Release the connection even when one of the statements above fails.
        con.close()
    return result.iloc[0, 0]


def _build_prompt(ddl_create: str, query_input: str) -> str:
    """Assemble the model prompt from the table DDL and the user's question."""
    # NOTE(review): whitespace is kept exactly as the previous prompt literal
    # had it (a single newline, after the instruction sentence) -- confirm it
    # matches the template the model expects.
    return (
        "### Instruction: Your task is to generate valid duckdb SQL to answer "
        "the following question. \n"
        "### Input: Here is the database schema that the SQL query will run on: "
        f"{ddl_create} ### Question: {query_input} "
        "### Response (use duckdb shorthand if possible): "
    )


def text2sql(dataset_name, query_input):
    """Generate a DuckDB SQL query answering *query_input* about *dataset_name*.

    Returns the model's SQL text, or a message starting with "❌" when the
    dataset's parquet listing cannot be fetched.
    """
    print(f"start text2sql for {dataset_name}")
    try:
        first_parquet = get_first_parquet(dataset_name)
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"❌ Dataset does not exist or is not supported {e}"

    first_parquet_url = first_parquet["url"]
    print(first_parquet_url)

    ddl_create = _read_table_ddl(first_parquet_url)
    text = _build_prompt(ddl_create, query_input)
    print(text)

    # query_remote_model(text) is the hosted alternative to the local model.
    sql_output = query_local_model(text)
    return sql_output


with gr.Blocks() as demo:
    gr.Markdown("# Talk to your dataset")
    gr.Markdown(
        "This space shows how to talk to your datasets: Get a brief description, "
        "create SQL queries, and get results."
    )
    gr.Markdown("Generate SQL queries'")
    dataset_name = gr.Textbox("sksayril/medicine-info", label="Dataset Name")
    query_input = gr.Textbox(
        "How many rows there are?",
        label="Ask something about your data",
    )
    btn = gr.Button("Generate SQL")
    query_output = gr.Textbox(label="Output SQL", interactive=False)
    btn.click(text2sql, inputs=[dataset_name, query_input], outputs=query_output)

demo.launch()