import os

import gradio as gr
import torch
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer

# Authenticate with the Hugging Face Hub (required for the gated Llama checkpoints)
login(token=os.getenv("HF_TOKEN"))

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    device_map="auto",
    torch_dtype="auto",
)


def generate_response(message, history):
    # With multimodal=True, Gradio passes a dict containing "text" and "files" keys
    inputs = tokenizer(message["text"], return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


demo = gr.ChatInterface(
    fn=generate_response,
    examples=[{"text": "Hello", "files": []}],
    title="LLAMA 3.2 Chat",
    multimodal=True,
)

demo.launch(debug=True)