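"""Gradio demo for Qwen's QVQ-72B-Preview vision-language model.

Upload an image and optionally ask a question about it; the model returns a
free-form text response. Intended to run as a Hugging Face Space using
ZeroGPU (the `spaces` package).
"""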
import spaces
import torch
import gradio as gr
from PIL import Image

# QVQ-72B-Preview needs a recent transformers release with Qwen2-VL support;
# upgrading at startup is a common workaround in Spaces without pinned requirements.
import subprocess
subprocess.run('pip install --upgrade transformers accelerate', shell=True, check=True)

from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# QVQ-72B-Preview uses the Qwen2-VL architecture, which transformers supports
# natively, so Qwen2VLForConditionalGeneration is the correct class
# (AutoModelForCausalLM does not cover vision-language models) and
# trust_remote_code is not needed.
processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/QVQ-72B-Preview",
    torch_dtype="auto",
    device_map="auto",
).eval()

# Footer
footer = """
<div style="text-align: center; margin-top: 20px;">
    <p>Powered by QVQ-72B Model</p>
</div>
"""

# Vision-language inference function
@spaces.GPU()
def process_image(image, text_input=None):
    if image is None:
        return "Please upload an image."

    # Gradio passes the image as a numpy array; convert to PIL RGB.
    image = Image.fromarray(image).convert("RGB")

    # Qwen2-VL expects a chat-format prompt; the chat template inserts the
    # image placeholder tokens that tell the model where the image goes.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": text_input or "Describe this image."},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    # Tokenize text and preprocess the image together, then move to the model.
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")
    inputs = inputs.to(model.device)

    with torch.inference_mode():
        generated_ids = model.generate(**inputs, max_new_tokens=1000)

    # Strip the prompt tokens so only the newly generated answer is decoded.
    trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(trimmed, skip_special_tokens=True)[0]

    return response

# CSS to hide Gradio's built-in footer (the custom footer above is shown instead)
css = """
footer {
    visibility: hidden;
}
"""

# Gradio interface
with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
    with gr.Row():
        input_img = gr.Image(label="Input Image")
    with gr.Row():
        text_input = gr.Textbox(label="Question (Optional)")
    with gr.Row():
        submit_btn = gr.Button(value="Submit")
    with gr.Row():
        output_text = gr.Textbox(label="Response")

    submit_btn.click(process_image, [input_img, text_input], [output_text])
    
    gr.HTML(footer)

# Launch the app
demo.launch(debug=True)
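
# Note: debug=True keeps the launch call blocking and prints errors to the
# console; when run locally, Gradio serves the UI at http://127.0.0.1:7860
# by default.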