| | import spaces |
| |
|
| | import re |
| | from typing import Tuple, Optional |
| |
|
| | import gradio as gr |
| | import numpy as np |
| | from PIL import Image, ImageDraw, ImageFont |
| | from smolvlm_inference import TransformersModel |
| |
|
| | from prompt import OS_SYSTEM_PROMPT |
| |
|
| | |
# Hugging Face model id of the GUI-agent checkpoint served by this demo.
MODEL_ID = "smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI"

print(f"Loading model and processor for {MODEL_ID}...")
# Placeholders initialized before loading. NOTE(review): `processor`,
# `model_loaded` and `load_error_message` are never assigned or read again
# in this file — presumably leftovers from an earlier load-with-fallback
# flow; confirm before removing.
model = None
processor = None
model_loaded = False
load_error_message = ""

# Eagerly load the model at import time onto the first CUDA device.
# NOTE(review): assumes a GPU is present — confirm for CPU-only hosts.
model = TransformersModel(
    model_id=MODEL_ID,
    to_device="cuda:0",
)
| |
|
| |
|
# Title rendered at the top of the Gradio UI.
title = "Smol2Operator Demo"

# NOTE(review): `description` is defined but never rendered by the Blocks UI
# below — confirm whether it should be shown (e.g. via gr.Markdown).
description = """
This is a demo of the Smol2Operator model designed to interact with graphical user interfaces (GUIs) and perform actions within them.
This proof-of-concept (POC) version, described in [blogpost], showcases the model’s core capabilities.
This compact release is intentionally scoped to fundamental tasks, with complex workflows planned for future iterations. :hugging_face:
"""

# System prompt instructing the model how to emit GUI actions (see prompt.py).
SYSTEM_PROMPT: str = OS_SYSTEM_PROMPT
| |
|
| |
|
def get_navigation_prompt(task, image, step=1):
    """Build the chat-style message list for one navigation turn.

    - task: the instruction the agent should complete
    - image: the current screenshot of the web page
    - step: the current step of the task (not used in the message text yet)

    Returns a list of role/content message dicts: the system prompt followed
    by a user message carrying the screenshot and the instruction text.
    """
    instruction_text = f"Please generate the next move according to the UI screenshot, instruction and previous actions.\n\nInstruction: {task}\n\nPrevious actions:\nNone"
    system_message = {
        "role": "system",
        "content": [
            {"type": "text", "text": SYSTEM_PROMPT},
        ],
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": instruction_text},
        ],
    }
    return [system_message, user_message]
| |
|
| |
|
def array_to_image(image_array: np.ndarray) -> Image.Image:
    """Convert a numpy screenshot array into a PIL image.

    Raises:
        ValueError: if ``image_array`` is None (no image uploaded).
    """
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    return Image.fromarray(np.uint8(image_array))
| |
|
| |
|
def parse_actions_from_response(response: str) -> list[str]:
    """Return every action snippet the model wrapped in code tags.

    Matches the inner text of each ``<code>`` ... ``</code>`` pair (the tags
    each sit on their own line); DOTALL lets a snippet span multiple lines.
    """
    code_block_re = re.compile(r"<code>\n(.*?)\n</code>", re.DOTALL)
    return code_block_re.findall(response)
| |
|
| |
|
def extract_coordinates_from_action(action_code: str) -> list[dict]:
    """Extract normalized coordinates from action code for localization actions.

    Recognizes ``click(...)``, ``double_click(...)``, ``move_mouse(...)`` and
    ``drag([x, y], [x, y])`` calls; coordinates may be positional or keyword
    (``x=...``/``y=...``). A drag yields two entries ('drag_from' then
    'drag_to'); the other actions yield one entry each.

    Returns:
        A list of dicts with keys 'type', 'x', 'y' and 'action'; x/y are
        floats (presumably normalized to [0, 1] — scaling happens in
        create_localized_image).
    """
    localization_actions = []

    # BUGFIX: the 'click' pattern needs a leading \b, otherwise it also
    # matches the tail of 'double_click(...)' and every double-click used to
    # produce a duplicate spurious 'click' entry. ('_' and 'c' are both word
    # characters, so \bclick cannot match inside 'double_click'.)
    patterns = {
        'click': r'\bclick\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
        'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
    }

    for action_type, pattern in patterns.items():
        for match in re.finditer(pattern, action_code):
            if action_type == 'drag':
                # A drag contributes both its start and end points.
                from_x, from_y, to_x, to_y = match.groups()
                localization_actions.append({
                    'type': 'drag_from',
                    'x': float(from_x),
                    'y': float(from_y),
                    'action': action_type,
                })
                localization_actions.append({
                    'type': 'drag_to',
                    'x': float(to_x),
                    'y': float(to_y),
                    'action': action_type,
                })
            else:
                x_val = match.group(1)
                # Single-argument calls reuse x as y; presumably intended as a
                # best-effort fallback for e.g. click(0.5) — TODO confirm.
                y_val = match.group(2) if match.group(2) else x_val
                if x_val and y_val:
                    localization_actions.append({
                        'type': action_type,
                        'x': float(x_val),
                        'y': float(y_val),
                        'action': action_type,
                    })

    return localization_actions
| |
|
| |
|
def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
    """Return a copy of the screenshot with action markers drawn on it.

    Each coordinate dict (as produced by extract_coordinates_from_action) is
    rendered as a colored dot plus a text label; a drag_from entry immediately
    followed by a drag_to entry additionally gets a connecting line with an
    arrowhead. Coordinates are treated as normalized to [0, 1] and scaled to
    the image size. Returns None when there is nothing to draw.
    """
    if not coordinates:
        return None

    annotated = original_image.copy()
    draw = ImageDraw.Draw(annotated)
    width, height = annotated.size
    font = ImageFont.load_default()

    # One marker color per action type; unknown types fall back to red.
    marker_colors = {
        'click': 'red',
        'double_click': 'blue',
        'move_mouse': 'green',
        'drag_from': 'orange',
        'drag_to': 'purple',
    }

    for index, coord in enumerate(coordinates):
        # Scale normalized coordinates to pixel space.
        px = int(coord['x'] * width)
        py = int(coord['y'] * height)
        color = marker_colors.get(coord['type'], 'red')

        # Filled dot marking the action location.
        radius = 8
        draw.ellipse(
            [px - radius, py - radius, px + radius, py + radius],
            fill=color, outline='white', width=2,
        )

        # Label with the action type and its normalized coordinates.
        label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
        if font:
            draw.text((px + 10, py - 10), label, fill=color, font=font)
        else:
            draw.text((px + 10, py - 10), label, fill=color)

        # Connect a drag_from marker to the drag_to marker that follows it.
        is_drag_pair = (
            coord['type'] == 'drag_from'
            and index + 1 < len(coordinates)
            and coordinates[index + 1]['type'] == 'drag_to'
        )
        if is_drag_pair:
            target = coordinates[index + 1]
            end_x = int(target['x'] * width)
            end_y = int(target['y'] * height)
            draw.line([px, py, end_x, end_y], fill='orange', width=3)

            # Arrowhead at the drag destination, built from the unit vector
            # along the drag direction.
            arrow_size = 10
            dx = end_x - px
            dy = end_y - py
            length = (dx**2 + dy**2) ** 0.5
            if length > 0:
                ux = dx / length
                uy = dy / length
                arrow_x1 = end_x - arrow_size * ux + arrow_size * uy * 0.5
                arrow_y1 = end_y - arrow_size * uy - arrow_size * ux * 0.5
                arrow_x2 = end_x - arrow_size * ux - arrow_size * uy * 0.5
                arrow_y2 = end_y - arrow_size * uy + arrow_size * ux * 0.5
                draw.polygon(
                    [end_x, end_y, arrow_x1, arrow_y1, arrow_x2, arrow_y2],
                    fill='orange',
                )

    return annotated
| |
|
| |
|
| | |
@spaces.GPU
def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    """Run one agent step: generate an action string plus an annotated image.

    Args:
        input_numpy_image: Screenshot as a numpy array (from the Gradio input).
        task: Natural-language instruction for the agent.

    Returns:
        A (response text, annotated screenshot) pair; the image is None when
        no localizable action was found in the response.

    Raises:
        ValueError: if no image was provided or the model failed to load.
    """
    screenshot = array_to_image(input_numpy_image)
    assert isinstance(screenshot, Image.Image)

    messages = get_navigation_prompt(task, screenshot)

    if model is None:
        raise ValueError("Model not loaded")

    navigation_str = model.generate(messages, max_new_tokens=500)
    print(f"Navigation string: {navigation_str}")
    navigation_str = navigation_str.strip()

    # Collect coordinates from every <code> block in the response.
    all_coordinates = []
    for action_code in parse_actions_from_response(navigation_str):
        all_coordinates.extend(extract_coordinates_from_action(action_code))

    localized_image = None
    if all_coordinates:
        localized_image = create_localized_image(screenshot, all_coordinates)
        print(f"Found {len(all_coordinates)} localization actions")

    return navigation_str, localized_image
| |
|
| |
|
| | |
# --- Example inputs for the Gradio UI ----------------------------------------
# FIX: the variables were annotated `: str` and then immediately rebound to
# PIL images — open the files directly so the annotation lie is gone.
# Paths are relative to the app's working directory.
example_1_image = Image.open("./assets/google.png")
example_1_task = "Search for the name of the current UK Prime Minister."

example_2_image = Image.open("./assets/huggingface.png")
example_2_task = "Find the most trending model."
| |
|
| |
|
# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
    # NOTE(review): the module-level `description` text is never rendered
    # here — confirm whether it should appear under the title.

    with gr.Row():
        # Screenshot input; also reused as the output slot for the
        # annotated image returned by navigate().
        input_image_component = gr.Image(label="UI Image", height=500)
    with gr.Row():
        with gr.Column():
            task_component = gr.Textbox(
                label="task",
                placeholder="e.g., Search for the name of the current UK Prime Minister.",
                info="Type the task you want the model to complete.",
            )
            submit_button = gr.Button("Call Agent", variant="primary")

        with gr.Column():
            output_coords_component = gr.Textbox(label="Agent Output", lines=10)

    # navigate() returns (text, annotated image); the image is written back
    # into the input component so markers appear in place.
    submit_button.click(navigate, [input_image_component, task_component], [output_coords_component, input_image_component])

    # NOTE(review): cache_examples=True runs navigate() at startup, which
    # requires the model/GPU to be ready — confirm this is intended.
    gr.Examples(
        examples=[[example_1_image, example_1_task], [example_2_image, example_2_task]],
        inputs=[input_image_component, task_component],
        outputs=[output_coords_component, input_image_component],
        fn=navigate,
        cache_examples=True,
    )

demo.queue(api_open=False)
demo.launch(debug=True, share=True)
| |
|