import streamlit as st
from transformers import T5ForConditionalGeneration, T5Tokenizer
from peft import get_peft_model, LoraConfig

# LoRA hyperparameters. These are meant to mirror the configuration that was used
# when the adapter was fine-tuned.
lora_config = LoraConfig(
    target_modules=["q", "v"],  # attention layers that receive LoRA adapters
    r=8,                        # low-rank parameter
    lora_alpha=32,              # scaling parameter
    lora_dropout=0.1,           # dropout rate
    bias="none",
)

# Load the model and tokenizer from Hugging Face's hub.
# Streamlit re-executes this script from the top on every widget interaction, so the
# (expensive) load is wrapped in st.cache_resource and performed only once; later
# reruns reuse the cached objects.
@st.cache_resource
def _load_model_and_tokenizer():
    """Build the LoRA-wrapped FLAN-T5 model and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple: the PEFT-wrapped
        ``google/flan-t5-large`` model in eval mode, and the tokenizer loaded
        from ``danrdoran/flan-t5-simplified-squad``.
    """
    base_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
    # NOTE(review): get_peft_model attaches newly initialised LoRA adapters to the
    # base checkpoint; nothing here loads trained adapter weights. If the intent is
    # to serve the fine-tuned model, the adapter (or merged checkpoint) presumably
    # needs to be loaded explicitly -- confirm against the training setup.
    peft_model = get_peft_model(base_model, lora_config)
    # Inference only: make sure dropout (including lora_dropout) is disabled.
    peft_model.eval()
    loaded_tokenizer = T5Tokenizer.from_pretrained("danrdoran/flan-t5-simplified-squad")
    return peft_model, loaded_tokenizer


model, tokenizer = _load_model_and_tokenizer()

# Main page header
st.title("AI English Tutor")
st.write("Ask me a question, and I will help you!")

# Sidebar: controls for the text-generation parameters
with st.sidebar:
    st.title("Model Parameters")
    temperature = st.slider("Temperature", min_value=0.1, max_value=1.5, value=1.0, step=0.1)
    top_p = st.slider("Top-p (Nucleus Sampling)", min_value=0.0, max_value=1.0, value=0.9, step=0.05)
    top_k = st.slider("Top-k", min_value=0, max_value=100, value=50, step=1)
    # Toggle for random sampling; unchecked by default
    do_sample = st.checkbox("Enable Random Sampling", value=False)

# Question typed by the student
student_question = st.text_input("Ask your question!")

# Generate and display a response using the model's generate() function.
# Whitespace-only input is treated the same as no input.
question = student_question.strip()
if question:
    # Wrap the question in the tutoring prompt and tokenize it; prompts longer than
    # 256 tokens are truncated.
    input_text = f"You are a tutor. Explain the answer to this question to a young student: '{question}'"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=256)

    # Honour the "Enable Random Sampling" checkbox. The sampling knobs
    # (temperature / top-p / top-k) only have meaning when sampling is on, so they
    # are passed only in that case; otherwise decoding is deterministic (greedy).
    generation_kwargs = {
        "do_sample": do_sample,
        "no_repeat_ngram_size": 3,  # never repeat the same 3-token sequence
    }
    if do_sample:
        generation_kwargs.update(temperature=temperature, top_p=top_p, top_k=top_k)

    # NOTE: no max_new_tokens / max_length is passed, so the generation config's
    # default length limit applies; set one explicitly if answers come out truncated.
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        **generation_kwargs,
    )

    # Decode the first (only) generated sequence, dropping special tokens
    response = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    st.write("Tutor's Answer:", response)