import spaces
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

# Model and processor initialization
processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")
# Load in the checkpoint's native dtype and let accelerate place the weights;
# a default full-precision load followed by .cuda() would not fit a 72B model on one GPU
model = AutoModelForImageTextToText.from_pretrained(
    "Qwen/QVQ-72B-Preview", torch_dtype="auto", device_map="auto"
).eval()
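# NOTE: a minimal alternative loading sketch for memory-constrained GPUs, assuming
# the optional bitsandbytes package is installed (not part of this Space's setup):
#
#   from transformers import BitsAndBytesConfig
#   model = AutoModelForImageTextToText.from_pretrained(
#       "Qwen/QVQ-72B-Preview",
#       quantization_config=BitsAndBytesConfig(load_in_4bit=True),
#       device_map="auto",
#   ).eval()
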
# Footer
footer = """
<div style="text-align: center; margin-top: 20px;">
<p>Powered by QVQ-72B Model</p>
</div>
"""
# Vision model function
@spaces.GPU()
def process_image(image, text_input=None):
    if image is None:
        return "Please upload an image first."
    # Convert the NumPy array from gr.Image to a PIL image
    image = Image.fromarray(image).convert("RGB")
    # Qwen2-VL-family processors expect chat-formatted text with an image placeholder,
    # so build the prompt via the processor's chat template
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": text_input or "Describe this image."},
        ],
    }]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)
    # Generate, then decode only the newly generated tokens (drop the echoed prompt)
    outputs = model.generate(**inputs, max_new_tokens=1000)
    new_tokens = outputs[:, inputs["input_ids"].shape[1]:]
    response = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
    return response
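# A minimal streaming variant of the same pipeline, sketched here as an optional
# helper (hypothetical, not wired into the UI below). It assumes the processor and
# model defined above, and uses transformers' TextIteratorStreamer with a background
# thread so partial text can be yielded to Gradio as tokens are generated.
@spaces.GPU()
def process_image_streaming(image, text_input=None):
    from threading import Thread
    from transformers import TextIteratorStreamer

    image = Image.fromarray(image).convert("RGB")
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": text_input or "Describe this image."},
        ],
    }]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)
    # skip_prompt=True keeps the echoed input tokens out of the stream
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
    thread = Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1000))
    thread.start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial
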
# CSS: hide Gradio's built-in footer so only the custom HTML footer is shown
css = """
footer {
visibility: hidden;
}
"""
# Gradio interface
with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
with gr.Row():
input_img = gr.Image(label="Input Image")
with gr.Row():
text_input = gr.Textbox(label="Question (Optional)")
with gr.Row():
submit_btn = gr.Button(value="Submit")
with gr.Row():
output_text = gr.Textbox(label="Response")
submit_btn.click(process_image, [input_img, text_input], [output_text])
gr.HTML(footer)
# Launch the app
demo.launch(debug=True)