import gradio as gr
import os
import tempfile
from pathlib import Path
import secrets
import dashscope
from dashscope import MultiModalConversation, Generation
from PIL import Image

# API key setup: read the DashScope API key from the environment
YOUR_API_TOKEN = os.getenv('YOUR_API_TOKEN')
dashscope.api_key = YOUR_API_TOKEN

# Global conversation state (shared across all sessions of this demo)
math_messages = []
image_descriptions = []

def process_image(image, shouldConvert=False):
    uploaded_file_dir = os.environ.get("GRADIO_TEMP_DIR") or str(
        Path(tempfile.gettempdir()) / "gradio"
    )
    os.makedirs(uploaded_file_dir, exist_ok=True)
    
    name = f"tmp{secrets.token_hex(20)}.jpg"
    filename = os.path.join(uploaded_file_dir, name)
    
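    # The sketchpad yields an RGBA image; flatten it onto a white background
    # before saving, since the temporary copy is written as JPEG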
    if shouldConvert:
        new_img = Image.new('RGB', size=(image.width, image.height), color=(255, 255, 255))
        new_img.paste(image, (0, 0), mask=image)
        image = new_img
    image.save(filename)
    
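    # Ask the vision-language model to describe and transcribe the mathematical
    # content of the saved image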
    messages = [{
        'role': 'system',
        'content': [{'text': 'You are a helpful assistant.'}]
    }, {
        'role': 'user',
        'content': [
            {'image': f'file://{filename}'},
            {'text': 'Please describe the math-related content in this image, ensuring that any LaTeX formulas are correctly transcribed. Non-mathematical details do not need to be described.'}
        ]
    }]
    
    response = MultiModalConversation.call(model='qwen-vl-max-0809', messages=messages)
    
    os.remove(filename)
    
    # The multimodal response content is a list of parts; return the text of the first part
    return response.output.choices[0]["message"]["content"][0]["text"]

def get_math_response(image_descriptions, user_question):
    global math_messages
    if not math_messages:
        math_messages.append({'role': 'system', 'content': 'You are a helpful math assistant.'})
    
    content = ("Image descriptions:\n" + "\n".join(image_descriptions)) if image_descriptions else ""
    content += f"\n\nUser question: {user_question}"
    
    math_messages.append({'role': 'user', 'content': content})
    response = Generation.call(
        model="qwen2.5-math-72b-instruct",
        messages=math_messages,
        result_format='message',
        stream=True
    )
    answer = ""
    for resp in response:
        if resp.output is None:
            continue
        answer = resp.output.choices[0].message.content
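        # Escape backslashes so LaTeX survives the chatbot's Markdown rendering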
        yield answer.replace("\\", "\\\\")
    
    math_messages.append({'role': 'assistant', 'content': answer})

def math_chat_bot(images, sketchpad, question, chat_history):
    global image_descriptions
    
    # Process newly uploaded images; gr.File provides file paths by default,
    # so open them and convert to RGB (the temporary copy is saved as JPEG)
    for image in images or []:
        if image:
            description = process_image(Image.open(image).convert("RGB"))
            image_descriptions.append(description)
    
    # Process sketchpad if present
    if sketchpad and sketchpad["composite"]:
        sketch_description = process_image(sketchpad["composite"], True)
        image_descriptions.append(sketch_description)
    
    # Generate the response; each streamed chunk is the full answer so far, not a delta
    response = ""
    for chunk in get_math_response(image_descriptions, question):
        response = chunk
        yield chat_history + [(question, response)]

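# CSS tweaks so KaTeX display math renders inline inside the chatbot messages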
css = """
#qwen-md .katex-display { display: inline; }
#qwen-md .katex-display>.katex { display: inline; }
#qwen-md .katex-display>.katex>.katex-html { display: inline; }
"""

# Create Gradio interface
with gr.Blocks(css=css) as demo:
    gr.HTML("""\
<p align="center"><img src="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png" style="height: 60px"/></p>"""
            """<center><font size=8>📖 Qwen2.5-Math Demo</center>"""
            """\
<center><font size=3>This WebUI uses Qwen2-VL for OCR and Qwen2.5-Math for mathematical reasoning. You can input mathematical or arithmetic problems as images, sketches, or text.</center>"""
            )
    
    with gr.Row():
        with gr.Column():
            input_images = gr.File(file_count="multiple", label="Upload Images")
            input_sketchpad = gr.Sketchpad(type="pil", label="Sketch", layers=False)
            input_text = gr.Textbox(label="Input your question")
            with gr.Row():
                clear_btn = gr.ClearButton([input_images, input_sketchpad, input_text])
                submit_btn = gr.Button("Submit", variant="primary")
        
        with gr.Column():
            chat_output = gr.Chatbot(label="Chat History", elem_id="qwen-md")
    
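    # math_chat_bot is a generator, so partial answers stream into the chatbot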
    submit_btn.click(
        fn=math_chat_bot,
        inputs=[input_images, input_sketchpad, input_text, chat_output],
        outputs=chat_output
    )

demo.launch()