plcedoz38 commited on
Commit
530f5d3
·
1 Parent(s): 9a8ee80

navigation example

Browse files
Files changed (3) hide show
  1. README.md +32 -90
  2. localization.py +52 -0
  3. navigation.py +186 -0
README.md CHANGED
@@ -80,6 +80,10 @@ benchmark [WebClick](https://huggingface.co/datasets/Hcompany/WebClick).
80
 
81
  ## Get Started with the Model
82
 
 
 
 
 
83
  We provide starter code for the localization task: i.e. image + instruction -> click coordinates
84
 
85
  We also provide code to reproduce screenspot evaluations: screenspot_eval.py
@@ -151,109 +155,47 @@ resized_height, resized_width = smart_resize(
151
  max_pixels=image_processor.max_pixels,
152
  )
153
  image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
154
-
155
- instruction = "Select July 14th as the check-out date"
156
  ```
157
 
158
- ### Localization as click(x, y)
159
 
160
  ```python
161
- def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
162
- guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
163
-
164
- return [
165
- {
166
- "role": "user",
167
- "content": [
168
- {
169
- "type": "image",
170
- "image": image,
171
- },
172
- {"type": "text", "text": f"{guidelines}\n{instruction}"},
173
- ],
174
- }
175
- ]
176
-
177
-
178
- messages = get_localization_prompt(image, instruction)
179
- coordinates_str = run_inference(messages)[0]
180
- print(coordinates_str)
181
- # Expected Click(352, 348)
182
  ```
183
 
184
- ### Structured Output
185
-
186
- We trained Holo1 as an Action VLM with extensive use of json and tool calls. Therefore, it can be queried reliably with structured output:
187
 
188
  ```python
189
- from pydantic import BaseModel, ConfigDict
190
-
191
- class FunctionDefinition(BaseModel):
192
- """Function definition data structure.
193
-
194
- Attributes:
195
- name: name of the function.
196
- description: description of the function.
197
- parameters: JSON schema for the function parameters.
198
- strict: Whether to enable strict schema adherence when generating the function call.
199
- """
200
-
201
- name: str
202
- description: str = ""
203
- parameters: dict[str, Any] = {}
204
- strict: bool = True
205
-
206
-
207
- class ClickAction(BaseModel):
208
- """Click at specific coordinates on the screen."""
209
 
210
- model_config = ConfigDict(
211
- extra="forbid",
212
- json_schema_serialization_defaults_required=True,
213
- json_schema_mode_override="serialization",
214
- use_attribute_docstrings=True,
215
- )
216
-
217
- action: Literal["click"] = "click"
218
- x: int
219
- """The x coordinate, number of pixels from the left edge."""
220
- y: int
221
- """The y coordinate, number of pixels from the top edge."""
222
 
 
223
 
224
- function_definition = FunctionDefinition(
225
- name="click_action",
226
- description=ClickAction.__doc__ or "",
227
- parameters=ClickAction.model_json_schema(),
228
- strict=True,
229
- )
230
 
 
 
 
231
 
232
- def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
233
- guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
234
-
235
- return [
236
- {
237
- "role": "system",
238
- "content": json.dumps([function_definition.model_dump()]),
239
- },
240
- {
241
- "role": "user",
242
- "content": [
243
- {
244
- "type": "image",
245
- "image": image,
246
- },
247
- {"type": "text", "text": f"{guidelines}\n{instruction}"},
248
- ],
249
- },
250
- ]
251
-
252
-
253
- messages = get_localization_prompt_structured_output(image, instruction)
254
- coordinates_str = run_inference(messages)[0]
255
- coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
256
- print(coordinates)
257
  # Expected ClickAction(action='click', x=352, y=340)
258
  ```
259
 
 
80
 
81
  ## Get Started with the Model
82
 
83
+ We provide 2 spaces to experiment with Localization and Navigation:
84
+ - https://huggingface.co/spaces/Hcompany/Holo1-Navigation
85
+ - https://huggingface.co/spaces/Hcompany/Holo1-Localization
86
+
87
  We provide starter code for the localization task: i.e. image + instruction -> click coordinates
88
 
89
  We also provide code to reproduce screenspot evaluations: screenspot_eval.py
 
155
  max_pixels=image_processor.max_pixels,
156
  )
157
  image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
 
 
158
  ```
159
 
160
+ ### Navigation with Structured Output
161
 
162
  ```python
163
+ import json
164
+ from . import navigation
165
+
166
+ task = "Book a hotel in Paris on August 3rd for 3 nights"
167
+ prompt = navigation.get_navigation_prompt(task, image, step=1)
168
+ navigation_str = run_inference(prompt)[0]
169
+ navigation_step = navigation.NavigationStep(**json.loads(navigation_str))
170
+ print(navigation_step)
171
+ # Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
 
 
 
 
 
 
 
 
 
 
 
 
172
  ```
173
 
174
+ ### Localization with click(x, y)
 
 
175
 
176
  ```python
177
+ from . import localization
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
+ instruction = "Select July 14th as the check-out date"
180
+ prompt = localization.get_localization_prompt(image, instruction)
181
+ coordinates = run_inference(prompt)[0]
182
+ print(coordinates)
183
+ # Expected Click(352, 348)
184
+ ```
 
 
 
 
 
 
185
 
186
+ ### Localization with Structured Output
187
 
188
+ We trained Holo1 as an Action VLM with extensive use of json and tool calls. Therefore, it can be queried reliably with structured output:
 
 
 
 
 
189
 
190
+ ```python
191
+ import json
192
+ from . import localization
193
 
194
+ instruction = "Select July 14th as the check-out date"
195
+ prompt = localization.get_localization_prompt_structured_output(image, instruction)
196
+ coordinates_structured_str = run_inference(prompt)[0]
197
+ coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
198
+ print(coordinates_structured)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  # Expected ClickAction(action='click', x=352, y=340)
200
  ```
201
 
localization.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import Any, Literal
3
+
4
+ from pydantic import BaseModel
5
+
6
+
7
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
    """Return the single-turn chat messages asking the model to localize a UI element.

    The model is expected to answer with a ``Click(x, y)`` string giving
    absolute pixel coordinates on the (resized) screenshot.

    Args:
        image: Screenshot to localize on (forwarded as the "image" content part).
        instruction: Natural-language description of the target element.

    Returns:
        A one-message list in the multimodal chat format (image part + text part).
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."

    # Assemble the content parts separately for readability, then wrap them
    # into the single user turn the chat template expects.
    image_part = {
        "type": "image",
        "image": image,
    }
    text_part = {"type": "text", "text": f"{guidelines}\n{instruction}"}
    user_message = {"role": "user", "content": [image_part, text_part]}
    return [user_message]
22
+
23
+
24
class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # Discriminator tag; always the literal string "click".
    action: Literal["click"] = "click"
    # NOTE(review): without `model_config = ConfigDict(use_attribute_docstrings=True)`
    # the bare-string docstrings below are NOT picked up as field descriptions in
    # model_json_schema() — confirm whether descriptions are intended in the schema.
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""
32
+
33
+
34
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Return chat messages for localization with structured (JSON) output.

    The system turn carries the JSON schema of ``ClickAction`` so the model
    can reply with a matching JSON object instead of a ``Click(x, y)`` string.

    Args:
        image: Screenshot to localize on (forwarded as the "image" content part).
        instruction: Natural-language description of the target element.

    Returns:
        A two-message list: a system message with the serialized schema,
        followed by the user message (image part + text part).
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."

    # The schema is wrapped in a list and serialized, mirroring a tool-call
    # style system message.
    schema_message = {
        "role": "system",
        "content": json.dumps([ClickAction.model_json_schema()]),
    }
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": image,
            },
            {"type": "text", "text": f"{guidelines}\n{instruction}"},
        ],
    }
    return [schema_message, user_message]
navigation.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
# System-prompt template for the navigation agent. It is consumed via
# str.format() with two placeholders: {timestamp} (current date string) and
# {output_format} (the NavigationStep JSON schema). Do not add literal braces
# without doubling them, or .format() will raise.
SYSTEM_PROMPT: str = """Imagine you are a robot browsing the web, just like humans. Now you need to complete a task.
In each iteration, you will receive an Observation that includes the last screenshots of a web browser and the current memory of the agent.
You have also information about the step that the agent is trying to achieve to solve the task.
Carefully analyze the visual information to identify what to do, then follow the guidelines to choose the following action.
You should detail your thought (i.e. reasoning steps) before taking the action.
Also detail in the notes field of the action the extracted information relevant to solve the task.
Once you have enough information in the notes to answer the task, return an answer action with the detailed answer in the notes field.
This will be evaluated by an evaluator and should match all the criteria or requirements of the task.

Guidelines:
- store in the notes all the relevant information to solve the task that fulfill the task criteria. Be precise
- Use both the task and the step information to decide what to do
- if you want to write in a text field and the text field already has text, designate the text field by the text it contains and its type
- If there is a cookies notice, always accept all the cookies first
- The observation is the screenshot of the current page and the memory of the agent.
- If you see relevant information on the screenshot to answer the task, add it to the notes field of the action.
- If there is no relevant information on the screenshot to answer the task, add an empty string to the notes field of the action.
- If you see buttons that allow to navigate directly to relevant information, like jump to ... or go to ... , use them to navigate faster.
- In the answer action, give as many details a possible relevant to answering the task.
- if you want to write, don't click before. Directly use the write action
- to write, identify the web element which is type and the text it already contains
- If you want to use a search bar, directly write text in the search bar
- Don't scroll too much. Don't scroll if the number of scrolls is greater than 3
- Don't scroll if you are at the end of the webpage
- Only refresh if you identify a rate limit problem
- If you are looking for a single flights, click on round-trip to select 'one way'
- Never try to login, enter email or password. If there is a need to login, then go back.
- If you are facing a captcha on a website, try to solve it.

- if you have enough information in the screenshot and in the notes to answer the task, return an answer action with the detailed answer in the notes field
- The current date is {timestamp}.

# <output_json_format>
# ```json
# {output_format}
# ```
# </output_json_format>

"""
44
+
45
+
46
class ClickElementAction(BaseModel):
    """Click at absolute coordinates of a web element with its description"""

    # Discriminator tag for this action within the ActionSpace union.
    # NOTE(review): Field() without default= makes `action` a *required* field in
    # the generated JSON schema, unlike RestartAction/AnswerAction which use plain
    # string defaults — confirm the asymmetry is intentional.
    action: Literal["click_element"] = Field(description="Click at absolute coordinates of a web element")
    element: str = Field(description="text description of the element")
    x: int = Field(description="The x coordinate, number of pixels from the left edge.")
    y: int = Field(description="The y coordinate, number of pixels from the top edge.")

    def log(self) -> str:
        # Human-readable trace line for agent logs.
        return f"I have clicked on the element '{self.element}' at absolute coordinates {self.x}, {self.y}"
56
+
57
+
58
class WriteElementAction(BaseModel):
    """Write content at absolute coordinates of a web element identified by its description, then press Enter."""

    # Discriminator tag for this action within the ActionSpace union.
    action: Literal["write_element_abs"] = Field(description="Write content at absolute coordinates of a web page")
    content: str = Field(description="Content to write")
    element: str = Field(description="Text description of the element")
    x: int = Field(description="The x coordinate, number of pixels from the left edge.")
    y: int = Field(description="The y coordinate, number of pixels from the top edge.")

    def log(self) -> str:
        # Human-readable trace line for agent logs.
        return f"I have written '{self.content}' in the element '{self.element}' at absolute coordinates {self.x}, {self.y}"
69
+
70
+
71
class ScrollAction(BaseModel):
    """Scroll action with no required element"""

    # Discriminator tag for this action within the ActionSpace union.
    action: Literal["scroll"] = Field(description="Scroll the page or a specific element")
    direction: Literal["down", "up", "left", "right"] = Field(description="The direction to scroll in")

    def log(self) -> str:
        # Human-readable trace line for agent logs.
        return f"I have scrolled {self.direction}"
79
+
80
+
81
class GoBackAction(BaseModel):
    """Action to navigate back in browser history"""

    # Discriminator tag for this action within the ActionSpace union.
    action: Literal["go_back"] = Field(description="Navigate to the previous page")

    def log(self) -> str:
        # Human-readable trace line for agent logs.
        return "I have gone back to the previous page"
88
+
89
+
90
class RefreshAction(BaseModel):
    """Action to refresh the current page"""

    # Discriminator tag for this action within the ActionSpace union.
    action: Literal["refresh"] = Field(description="Refresh the current page")

    def log(self) -> str:
        # Human-readable trace line for agent logs.
        return "I have refreshed the page"
97
+
98
+
99
class GotoAction(BaseModel):
    """Action to go to a particular URL"""

    # Discriminator tag for this action within the ActionSpace union.
    action: Literal["goto"] = Field(description="Goto a particular URL")
    url: str = Field(description="A url starting with http:// or https://")

    def log(self) -> str:
        # Human-readable trace line for agent logs.
        return f"I have navigated to the URL {self.url}"
107
+
108
+
109
class WaitAction(BaseModel):
    """Action to wait for a particular amount of time"""

    # Discriminator tag for this action within the ActionSpace union.
    action: Literal["wait"] = Field(description="Wait for a particular amount of time")
    # Bounded wait: 0-10 seconds inclusive, defaulting to 2.
    seconds: int = Field(default=2, ge=0, le=10, description="The number of seconds to wait")

    def log(self) -> str:
        # Human-readable trace line for agent logs.
        return f"I have waited for {self.seconds} seconds"
117
+
118
+
119
class RestartAction(BaseModel):
    """Restart the task from the beginning."""

    # Discriminator tag; plain default (optional in schema), unlike the
    # Field(...)-declared action tags of the element actions above.
    action: Literal["restart"] = "restart"

    def log(self) -> str:
        # Human-readable trace line for agent logs.
        return "I have restarted the task from the beginning"
126
+
127
+
128
class AnswerAction(BaseModel):
    """Return a final answer to the task. This is the last action to call in an episode."""

    # Discriminator tag; plain default (optional in schema).
    action: Literal["answer"] = "answer"
    content: str = Field(description="The answer content")

    def log(self) -> str:
        # Human-readable trace line for agent logs.
        return f"I have answered the task with '{self.content}'"
136
+
137
+
138
# Union of every action the agent may emit; members are distinguished by their
# `action` literal tag. Used as the type of NavigationStep.action, so this
# union's JSON schema is embedded into the system prompt.
ActionSpace = (
    ClickElementAction
    | WriteElementAction
    | ScrollAction
    | GoBackAction
    | RefreshAction
    | WaitAction
    | RestartAction
    | AnswerAction
    | GotoAction
)
149
+
150
+
151
# One step of the navigation loop: optional extracted notes, the model's
# reasoning, and exactly one action drawn from ActionSpace. (No class docstring
# on purpose here would change the schema description; keep comments only.)
class NavigationStep(BaseModel):
    note: str = Field(
        default="",
        description="Task-relevant information extracted from the previous observation. Keep empty if no new info.",
    )
    thought: str = Field(description="Reasoning about next steps (<4 lines)")
    action: ActionSpace = Field(description="Next action to take")
158
+
159
+
160
def get_navigation_prompt(task, image, step=1, timestamp=None):
    """Build the system + user chat messages for one navigation step.

    Args:
        task: Natural-language description of the task to accomplish.
        image: Screenshot of the current page (forwarded as the "image"
            content part).
        step: 1-based index of the current step, echoed in the
            ``<observation step=...>`` tag.
        timestamp: Optional "YYYY-MM-DD HH:MM:SS" string injected as the
            current date in the system prompt; defaults to the actual
            current local time.

    Returns:
        A two-message list (system, user) in the multimodal content-parts
        chat format.
    """
    # Bug fix: the timestamp was hard-coded to "2025-06-04 14:16:03" while the
    # prompt asserts "The current date is {timestamp}" — default to the real
    # current time instead of a stale constant. Local import keeps the change
    # self-contained.
    if timestamp is None:
        from datetime import datetime

        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    system_prompt = SYSTEM_PROMPT.format(
        output_format=NavigationStep.model_json_schema(),
        timestamp=timestamp,
    )
    return [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": system_prompt},
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": f"<task>\n{task}\n</task>\n"},
                {"type": "text", "text": f"<observation step={step}>\n"},
                {"type": "text", "text": "<screenshot>\n"},
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": "\n</screenshot>\n"},
                {"type": "text", "text": "\n</observation>\n"},
            ],
        },
    ]