import os

import gradio as gr
import torch
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer

# Authenticate with the Hugging Face Hub (required for the gated Llama checkpoints)
login(token=os.getenv("HF_TOKEN"))

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    device_map="auto",
    torch_dtype="auto",
)


def generate_response(message, history):
    # With multimodal=True, Gradio passes a dict containing "text" and "files" keys
    inputs = tokenizer(message["text"], return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


demo = gr.ChatInterface(
    fn=generate_response,
    examples=[{"text": "Hello", "files": []}],
    title="LLAMA 3.2 Chat",
    multimodal=True,
)

demo.launch(debug=True)