from sentence_transformers import SentenceTransformer
from huggingface_hub import CommitScheduler
from datasets import Dataset
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import os
from utility import load_from_hub_csv
from DNAseq import DNAseq
from grapher import DNAgrapher
from parameter_extractor import ParameterExtractor

from helper import list_at_index_0, list_at_index_1
from logger import cts_log_file_create, logger, cts_logger


# Hub credentials and the target repository id are read from the environment;
# both are None when the corresponding variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
repo_id = os.getenv("repo_id")

# CSV file that receives the query log entries
log_file_path = cts_log_file_create("flagged")

# Background scheduler that commits the folder containing the log file to the
# private dataset repo under "data/".
# `every` is in minutes according to the huggingface_hub documentation
# (2880 min = 48 h) - confirm against the installed version.
scheduler = CommitScheduler(
    repo_id=repo_id,
    repo_type="dataset",
    folder_path=log_file_path.parent,
    path_in_repo="data",
    every=2880,
    private=True,
    token=HF_TOKEN,
)

# Fetch the code-to-function mapping; chat_to_sequence later reads it from
# "code_function_mapping.csv" in the working directory.
load_from_hub_csv(
    path=repo_id,
    data_file="app/code_function_mapping.csv",
    token=HF_TOKEN,
    csv_output_file="code_function_mapping.csv",
)

# Loaded SentenceTransformer models keyed by model name, so the model is
# loaded once per process instead of on every request.
_MODEL_CACHE = {}


def _get_sentence_model(model_name):
    """Return the SentenceTransformer for *model_name*, loading it on first use."""
    if model_name not in _MODEL_CACHE:
        _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _MODEL_CACHE[model_name]


def chat_to_sequence(sequence, user_query):
    """Answer a natural-language question about a DNA sequence.

    The query is embedded and matched against the reference query database
    with a FAISS nearest-neighbour search (k=3). Depending on the score of the
    best match the function either asks the user to rephrase, suggests the
    closest reference question, or evaluates the expression mapped to the
    matched code in ``code_function_mapping.csv``. Every call is logged through
    ``cts_logger``.

    Args:
        sequence: DNA sequence entered by the user.
        user_query: Free-text question about the sequence.

    Returns:
        A ``(response, fig, code_descript_message)`` tuple. ``response`` is the
        text answer (``None`` when a figure is produced), ``fig`` is a plotly
        ``Figure`` or ``None``, and ``code_descript_message`` is the title-cased
        description of the executed action ('' when nothing was executed).
    """
    # NOTE: these locals look unused but must stay. The expression evaluated
    # further down is resolved in this function's scope, and `dna` / `query`
    # are the names the DNAseq / ParameterExtractor expressions are expected
    # to reference (per the original comments) - verify against the mapping CSV.
    input_sequence = sequence
    dna = input_sequence
    query = user_query

    # Embedding model (cached across calls)
    model_name = "all-mpnet-base-v2"
    model = _get_sentence_model(model_name)

    # Default outputs
    fig = None
    response = None
    code_descript_message = ''

    # kNN score thresholds (lower score = closer match, 0 = exact match):
    #   score >= upper          -> query not understood
    #   lower < score < upper   -> suggest the closest reference question
    #   score <= lower          -> execute the mapped code
    proximal_lower_threshold = 1.1
    proximal_upper_threshold = 1.4

    threshold_exceeded_message = "Your Query Wasn't Understood. Can You Rephrase The Query"
    threshold_approximate_message = "Your Query Wasn't Understood Clearly. Try Using The Following Query Formats"

    # Load the function mapping CSV file into a pandas DataFrame
    code_function_mapping = pd.read_csv("code_function_mapping.csv")

    # Load the reference query database and attach its prebuilt FAISS index
    ref_query_df = pd.read_json('reference_query_db.json', orient='records')
    ref_query_ds = Dataset.from_pandas(ref_query_df)
    ref_query_ds.load_faiss_index('all-mpnet-base-v2_embeddings', 'ref_query_db_index')

    # Semantic similarity search of the user query against the sample queries
    query_embedding = model.encode(user_query)
    index_result = ref_query_ds.get_nearest_examples("all-mpnet-base-v2_embeddings", query_embedding, k=3)
    scores, examples = index_result

    # Combine examples and scores, best (lowest) score first
    result_df = pd.DataFrame(examples)
    result_df['score'] = scores
    sorted_df = result_df.sort_values(by='score', ascending=True)

    # Best match
    ref_question = sorted_df.iloc[0]['question']
    query_code = sorted_df.iloc[0]['code']
    query_score = sorted_df.iloc[0]['score']

    # Runner-up matches are only logged. The third entry previously re-read
    # row 1; it now reads row 2. Fall back to None if the index returned fewer
    # neighbours than requested.
    match_count = len(sorted_df)
    ref_question_2 = sorted_df.iloc[1]['question'] if match_count > 1 else None
    query_score_2 = sorted_df.iloc[1]['score'] if match_count > 1 else None
    ref_question_3 = sorted_df.iloc[2]['question'] if match_count > 2 else None
    query_score_3 = sorted_df.iloc[2]['score'] if match_count > 2 else None

    # Mapping row for the matched code. Looked up once and guarded, so that a
    # code without a mapping reaches the "No Match Error" handling below
    # instead of raising IndexError here.
    matching_row = code_function_mapping[code_function_mapping["code"] == query_code]
    if not matching_row.empty:
        query_code_description = matching_row.iloc[0]['description']
    else:
        query_code_description = ''

    # Extra log entities
    similarity_metric = "k nearest neighbours"

    # Check the query score against threshold values
    if query_score >= proximal_upper_threshold:
        response = threshold_exceeded_message
        print(threshold_exceeded_message)

    elif proximal_lower_threshold < query_score < proximal_upper_threshold:
        response = threshold_approximate_message + "\n" + ref_question
        print(threshold_approximate_message, ref_question)

    else:
        print("Execute query")
        code = query_code

        if not matching_row.empty:
            function = matching_row.iloc[0]["function"]
            # SECURITY: eval() executes an expression string taken from the
            # mapping CSV with the user's inputs in scope. This is only safe
            # while that CSV is fully trusted; consider replacing it with an
            # explicit dispatch table of callables.
            f_response = eval(function)
            if code[0] == 'c':
                # Codes starting with 'c' produce chart data rather than text
                fig = go.Figure(f_response)
            else:
                response = str(f_response)
            code_descript_message = query_code_description.title()
        else:
            response = "Error processing query"
            query_code = "No Match Error"
            print("No matching code found for the function:", code)

    # Build the log row last so it reflects the final query_code (including
    # the "No Match Error" marker), then log exactly once per call.
    log_data = [
        user_query,
        ref_question,
        query_score,
        query_code,
        ref_question_2,
        query_score_2,
        ref_question_3,
        query_score_3,
        similarity_metric,
        model_name,
        proximal_lower_threshold,
        proximal_upper_threshold,
    ]
    cts_logger(scheduler, log_file_path, log_data, response)

    return response, fig, code_descript_message


# Gradio UI: two text inputs (sequence, query) wired to chat_to_sequence, whose
# three return values fill the text response, the plot and the action label.
ChatToSequence = gr.Interface(
    fn=chat_to_sequence,
    inputs=[gr.Textbox(label="Sequence", placeholder="Input DNA Sequence..."),
            gr.Textbox(label="Query", placeholder="Input Query...")],
    outputs=[gr.Textbox(label="Response"),
             gr.Plot(label='Graphic Response'),
             gr.Textbox(label="Action Executed")],
    # Built-in flagging is off; queries are logged by chat_to_sequence instead
    allow_flagging="never",
    title="Chat-To-Sequence",
    # Closing tags fixed: they were mis-nested in the heading and were opening
    # tags at the end of the disclaimer.
    description="<h2><center><span style='color: purple;'>This Demo App Allows You To Explore Your DNA Sequence Using Natural Language</span></center></h2>"
                "<h5><center>Disclaimer: The app stores the user queries but doesn't store the DNA sequence."
                " Please Don't Input Any Information You Don't Wish To Share Into The Query Box.</center></h5>",
    theme=gr.themes.Soft(),
    examples=[
        ["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaaa",
         "What is the length of the sequence"],
        ["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaaa",
         "How many guanines bases are there in the sequence"],
        ["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaaa",
         "What is the base at position 10"],
        ["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaaa",
         "What are the bases from position 2 to 10"],
        ["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaaa",
         "How many bases are there from position 2 to 10"],
        # NOTE(review): this sequence has two more trailing 'a' bases than the
        # ones above - confirm that is intentional.
        ["ggcattgaggagaccattgacaccgtcattagcaatgcactacaactgtcacaacctaaaaaa",
         "Show pie chart of total bases"],
    ],
).queue()

# Start the web server (runs on import of this module)
ChatToSequence.launch()