import torch from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import TapexTokenizer, BartForConditionalGeneration import pandas as pd import gradio as gr import numpy as np import time import os import random #import pyodbc ''' import pkg_resources # Get a list of installed packages and their versions installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set} # Print the list of packages for package, version in installed_packages.items(): print(f"{package}=={version}") ''' ''' # Replace the connection parameters with your SQL Server information server = 'your_server' database = 'your_database' username = 'your_username' password = 'your_password' driver = 'SQL Server' # This depends on the ODBC driver installed on your system # Create the connection string connection_string = f'DRIVER={{{driver}}};SERVER={server};DATABASE={database};UID={username};PWD={password}' # Connect to the SQL Server conn = pyodbc.connect(connection_string) #============================================================================ # Replace "your_query" with your SQL query to fetch data from the database query = 'SELECT * FROM your_table_name' # Use pandas to read data from the SQL Server and store it in a DataFrame df = pd.read_sql_query(query, conn) # Close the SQL connection conn.close() ''' # Create a sample DataFrame with 3,000 records and 20 columns ''' num_records = 3000 num_columns = 20 data = { f"column_{i}": np.random.randint(0, 100, num_records) for i in range(num_columns) } # Randomize the year and city columns years = list(range(2000, 2023)) # Range of years cities = ["New York", "Los Angeles", "Chicago", "Houston", "Miami"] # List of cities data["year"] = [random.choice(years) for _ in range(num_records)] data["city"] = [random.choice(cities) for _ in range(num_records)] table = pd.DataFrame(data) ''' #table = pd.read_csv(csv_file.name, delimiter=",") #table.fillna(0, inplace=True) #table = table.astype(str) data = { "year": [1896, 1900, 1904, 2004, 2008, 2012], "city": ["athens", "paris", "st. louis", "athens", "beijing", "london"] } table = pd.DataFrame.from_dict(data) # Load the chatbot model chatbot_model_name = "microsoft/DialoGPT-medium" tokenizer = AutoTokenizer.from_pretrained(chatbot_model_name) model = AutoModelForCausalLM.from_pretrained(chatbot_model_name) #cmax_token_limit = tokenizer.max_model_input_sizes[chatbot_model_name] #print(f"Chat bot Maximum token limit for {chatbot_model_name}: {cmax_token_limit}") # Load the SQL Model sql_model_name = "microsoft/tapex-large-finetuned-wtq" sql_tokenizer = TapexTokenizer.from_pretrained(sql_model_name) sql_model = BartForConditionalGeneration.from_pretrained(sql_model_name) #stokenizer = AutoTokenizer.from_pretrained(sql_model_name) #max_token_limit = stokenizer.max_model_input_sizes[sql_model_name] #print(f"SQL Maximum token limit for {sql_model_name}: {max_token_limit}") #sql_response = None conversation_history = [] def chat(input, history=[]): #global sql_response # Check if the user input is a question #is_question = "?" in input ''' if is_question: sql_encoding = sql_tokenizer(table=table, query=input + sql_tokenizer.eos_token, return_tensors="pt") sql_outputs = sql_model.generate(**sql_encoding) sql_response = sql_tokenizer.batch_decode(sql_outputs, skip_special_tokens=True) else: ''' # tokenize the new input sentence new_user_input_ids = tokenizer.encode(input + tokenizer.eos_token, return_tensors='pt') # append the new user input tokens to the chat history bot_input_ids = torch.cat([torch.LongTensor(history), new_user_input_ids], dim=-1) # generate a response history = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id).tolist() # convert the tokens to text, and then split the responses into the right format response = tokenizer.decode(history[0]).split("<|endoftext|>") response = [(response[i], response[i + 1]) for i in range(0, len(response) - 1, 2)] # convert to tuples of list return response, history def sqlquery(input): #, history=[]): global conversation_history #====================================================================== ''' batch_size = 10 # Number of records in each batch num_records = 3000 # Total number of records in the dataset for start_idx in range(0, num_records, batch_size): end_idx = min(start_idx + batch_size, num_records) # Get a batch of records batch_data = table[start_idx:end_idx] # Tokenize the batch tokenized_batch = sql_tokenizer.batch_encode_plus( batch_data, padding=True, truncation=True, return_tensors="pt" ) # Perform inference with torch.no_grad(): output = sql_model.generate( input_ids=tokenized_batch["input_ids"], max_length=1024, pad_token_id=sql_tokenizer.eos_token_id, ) # Decode the output and process the responses responses = [sql_tokenizer.decode(ids, skip_special_tokens=True) for ids in output] conversation_history.append("User: " + record["question"]) for response in enumerate(responses): # Update conversation history conversation_history.append("Bot: " + response) ''' # ========================================================================== inputs = [input] sql_encoding = sql_tokenizer(table=table, query=input, return_tensors="pt") sql_outputs = sql_model.generate(**sql_encoding) sql_response = sql_tokenizer.batch_decode(sql_outputs, skip_special_tokens=True) #history.append((input, sql_response)) conversation_history.append(("User", input)) conversation_history.append(("Bot", sql_response)) # Build conversation string #conversation = "\n".join([f"User: {user_msg}\nBot: {resp_msg}" for user_msg, resp_msg in conversation_history]) conversation = "\n".join([f"{sender}: {msg}" for sender, msg in conversation_history]) return conversation #return sql_response #return sql_response, history ''' html = "