Spaces:
Paused
Paused
init
Browse files
- llava/conversation.py +3 -1
- llava/serve/gradio_web_server.py +4 -17
- pyproject.toml +1 -1
llava/conversation.py
CHANGED
@@ -162,7 +162,7 @@ class Conversation:
|
|
162 |
images.append(image)
|
163 |
return images
|
164 |
|
165 |
-
def to_gradio_chatbot(self,extra_image=None):
|
166 |
ret = []
|
167 |
for i, (role, msg) in enumerate(reversed(self.messages[self.offset:])):
|
168 |
if role==self.roles[0]:
|
@@ -195,6 +195,8 @@ class Conversation:
|
|
195 |
image_format='JPEG')
|
196 |
img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
|
197 |
msg=img_str
|
|
|
|
|
198 |
ret.append([msg, None])
|
199 |
break
|
200 |
return ret
|
|
|
162 |
images.append(image)
|
163 |
return images
|
164 |
|
165 |
+
def to_gradio_chatbot(self,extra_image=None,extra_coordinates=None):
|
166 |
ret = []
|
167 |
for i, (role, msg) in enumerate(reversed(self.messages[self.offset:])):
|
168 |
if role==self.roles[0]:
|
|
|
195 |
image_format='JPEG')
|
196 |
img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
|
197 |
msg=img_str
|
198 |
+
if not extra_coordinates:
|
199 |
+
msg=f"The element is at {extra_coordinates} on the screen: " +msg
|
200 |
ret.append([msg, None])
|
201 |
break
|
202 |
return ret
|
llava/serve/gradio_web_server.py
CHANGED
@@ -70,19 +70,6 @@ from PIL import Image, ImageDraw
|
|
70 |
|
71 |
|
72 |
def draw_circle_on_image(image, x, y, radius=20, color=(255, 0, 0)):
|
73 |
-
"""
|
74 |
-
在给定的图片上绘制一个红色圆圈,并返回新的图片。如果 x, y 坐标不在图片范围内,
|
75 |
-
并且 y 超出了图片高度,则尝试将 y 减去 224;如果调整后的 y 仍然超出范围,则返回原图。
|
76 |
-
|
77 |
-
参数:
|
78 |
-
- image: 传入的 PIL.Image 对象
|
79 |
-
- x, y: 圆心的绝对坐标
|
80 |
-
- radius: 圆圈的半径,默认为 10
|
81 |
-
- color: 圆圈的颜色,默认为红色 (255, 0, 0)
|
82 |
-
|
83 |
-
返回:
|
84 |
-
- 带有红色圆圈的 PIL.Image 对象,或者在坐标不合法时返回原图。
|
85 |
-
"""
|
86 |
# 获取图片的宽度和高度
|
87 |
img_width, img_height = image.size
|
88 |
|
@@ -108,9 +95,9 @@ def draw_circle_on_image(image, x, y, radius=20, color=(255, 0, 0)):
|
|
108 |
right_down_point = (x + radius, y + radius)
|
109 |
|
110 |
# 绘制圆圈 (outline 参数设置圆圈的颜色,width 设置线条粗细)
|
111 |
-
draw.ellipse([left_up_point, right_down_point], outline=color, width=
|
112 |
|
113 |
-
return image
|
114 |
|
115 |
def get_conv_log_filename():
|
116 |
t = datetime.datetime.now()
|
@@ -391,9 +378,9 @@ def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request:
|
|
391 |
if len(all_images) > 0:
|
392 |
# 假设我们对第一张图片进行 resize 并展示
|
393 |
|
394 |
-
resized_image = draw_circle_on_image(resize_image(all_images[0]),original_coord[0],original_coord[1])
|
395 |
# state.append_message(state.roles[1], ("", resized_image,"Default"))
|
396 |
-
yield (state, state.to_gradio_chatbot(resized_image)) + (enable_btn,) * 5
|
397 |
|
398 |
with open(get_conv_log_filename(), "a") as fout:
|
399 |
data = {
|
|
|
70 |
|
71 |
|
72 |
def draw_circle_on_image(image, x, y, radius=20, color=(255, 0, 0)):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
# 获取图片的宽度和高度
|
74 |
img_width, img_height = image.size
|
75 |
|
|
|
95 |
right_down_point = (x + radius, y + radius)
|
96 |
|
97 |
# 绘制圆圈 (outline 参数设置圆圈的颜色,width 设置线条粗细)
|
98 |
+
draw.ellipse([left_up_point, right_down_point], outline=color, width=5)
|
99 |
|
100 |
+
return image,(x,y)
|
101 |
|
102 |
def get_conv_log_filename():
|
103 |
t = datetime.datetime.now()
|
|
|
378 |
if len(all_images) > 0:
|
379 |
# 假设我们对第一张图片进行 resize 并展示
|
380 |
|
381 |
+
resized_image,coordinates = draw_circle_on_image(resize_image(all_images[0]),original_coord[0],original_coord[1])
|
382 |
# state.append_message(state.roles[1], ("", resized_image,"Default"))
|
383 |
+
yield (state, state.to_gradio_chatbot(resized_image,coordinates)) + (enable_btn,) * 5
|
384 |
|
385 |
with open(get_conv_log_filename(), "a") as fout:
|
386 |
data = {
|
pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4 |
|
5 |
[project]
|
6 |
name = "uground_demo_test"
|
7 |
-
version = "3.
|
8 |
description = "Navigating the Digital World as Humans Do: Universal Visual Grounding for GUI Agents"
|
9 |
readme = "README.md"
|
10 |
requires-python = ">=3.8"
|
|
|
4 |
|
5 |
[project]
|
6 |
name = "uground_demo_test"
|
7 |
+
version = "3.9"
|
8 |
description = "Navigating the Digital World as Humans Do: Universal Visual Grounding for GUI Agents"
|
9 |
readme = "README.md"
|
10 |
requires-python = ">=3.8"
|