import spaces import os import time import torch from transformers import AutoProcessor, AutoModelForImageTextToText import gradio as gr from threading import Thread from PIL import Image # Model and processor initialization processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview") model = AutoModelForImageTextToText.from_pretrained("Qwen/QVQ-72B-Preview").cuda().eval() # Footer footer = """

Powered by QVQ-72B Model

""" # Vision model function @spaces.GPU() def process_image(image, text_input=None): # Convert image to PIL format image = Image.fromarray(image).convert("RGB") # Prepare inputs if text_input: inputs = processor(text=text_input, images=image, return_tensors="pt").to("cuda:0") else: inputs = processor(images=image, return_tensors="pt").to("cuda:0") # Generate output outputs = model.generate(**inputs, max_new_tokens=1000) # Decode response response = processor.batch_decode(outputs, skip_special_tokens=True)[0] return response # CSS styling css = """ footer { visibility: hidden; } """ # Gradio interface with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo: with gr.Row(): input_img = gr.Image(label="Input Image") with gr.Row(): text_input = gr.Textbox(label="Question (Optional)") with gr.Row(): submit_btn = gr.Button(value="Submit") with gr.Row(): output_text = gr.Textbox(label="Response") submit_btn.click(process_image, [input_img, text_input], [output_text]) gr.HTML(footer) # Launch the app demo.launch(debug=True)