|
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.

    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    # NOTE(review): f-string interpolation breaks the XPath if `text` contains a
    # single quote; XPath 1.0 has no escape character, so such inputs would need
    # concat() handling — flagged rather than silently fixed.
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if nth_result < 1:
        # Guard: a non-positive occurrence number would otherwise index from the
        # end of the list (e.g. elements[-1]) and silently return the wrong match.
        raise Exception(f"Match n°{nth_result} not found (occurrence numbers start at 1)")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
    result = f"Found {len(elements)} matches for '{text}'."
    elem = elements[nth_result - 1]
    # Scroll the chosen match into the viewport so follow-up screenshots show it.
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result
|
|
|
|
|
@tool
def go_back() -> None:
    """Goes back to previous page."""
    # Delegates to Selenium's browser-history navigation (equivalent to the
    # browser's Back button) on the module-level `driver`.
    driver.back()
|
|
|
|
|
@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
    """
    # Sends a single ESC keypress to the page; most modal dialogs close on
    # Escape, but banners that don't listen for it (e.g. cookie consent) won't.
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
|
|
|
def save_screenshot(step_log: ActionStep, agent: CodeAgent) -> None:
    """Step callback: attach a browser screenshot and the current URL to the step log.

    Args:
        step_log: The log entry for the step that just finished.
        agent: The running agent; its log history is scanned to drop stale screenshots.
    """
    sleep(1.0)  # let any JavaScript animations settle before capturing
    driver = helium.get_driver()
    current_step = step_log.step_number
    if driver is not None:
        # Keep screenshots only for the two most recent steps: drop images from
        # older steps to limit memory/token usage.
        # BUGFIX: the condition previously tested the callback argument
        # `step_log` instead of the loop variable, so the age filter never
        # applied to the entries actually being cleared.
        for previous_step in agent.logs:
            if isinstance(previous_step, ActionStep) and previous_step.step_number <= current_step - 2:
                previous_step.observations_images = None
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        # Store a copy so the PIL image stays valid independently of `image`.
        step_log.observations_images = [image.copy()]

    # NOTE(review): this assumes `driver` is not None here; if helium has no
    # active driver, `driver.current_url` would raise — confirm callers always
    # run this after the browser is started.
    url_info = f"Current url: {driver.current_url}"
    # BUGFIX: previously read `step_logs.observations` (the leaked loop
    # variable) instead of the current step's observations.
    step_log.observations = url_info if step_log.observations is None else step_log.observations + "\n" + url_info
    return
|
|
|
from smolagents import CodeAgent, OpenAIServerModel, DuckDuckGoSearchTool

# Chat model backing the agent; requires an OPENAI_API_KEY in the environment.
model = OpenAIServerModel(model_id="gpt-4o")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Browser-automation agent: web search plus the Selenium-backed tools above.
agent = CodeAgent(
    tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],
    model=model,
    # Allow generated code to drive the browser via helium.
    additional_authorized_imports=["helium"],
    # Capture a screenshot + current URL after every step.
    step_callbacks=[save_screenshot],
    max_steps=20,
    verbosity_level=2,
)
|
|
|
# System-prompt text for single-image analysis (OCR, object/text identification).
# Runtime string — content must not be edited casually; it is sent to the model verbatim.
prompt_analysis="""Extract information from an image by analyzing and interpreting its visual elements to provide a detailed description or identify specific data.

# Steps

1. **Analyze the Image**: Identify key elements such as objects, text, colors, and any notable features or contexts.
2. **Interpret Visual Elements**: Determine the significance or purpose of the elements identified. Consider relationships between objects, text recognition if applicable, and any context clues.
3. **Synthesize Information**: Bring together the interpreted elements to form a coherent understanding or summary.
4. **Verify Details**: Ensure accuracy by cross-referencing identifiable text or icons with known data or references, if relevant.

# Output Format

The output should be a detailed text description or a structured data response (such as a JSON) containing the identified elements and their interpretations. Each identified element should be clearly described along with its context or significance.

# Example

**Input**: (An image with a storefront displaying 'Bakery' sign and a variety of bread on display.)

**Output**:

Description:
- **Storefront**: A bakery
- **Signage Text**: "Bakery"
- **Products**: Various types of bread

JSON Example:
```json
{
  "storeType": "Bakery",
  "signText": "Bakery",
  "products": ["bread", "baguette", "pastry"]
}
```

# Notes

- Consider optical character recognition (OCR) for text extraction.
- Evaluate colors and objects for brand or function associations.
- Provide a holistic overview rather than disjointed elements when possible."""
|
|
|
|
|
# System-prompt text for video analysis (scenes, captions, audio cues).
# Runtime string — content must not be edited casually; it is sent to the model verbatim.
prompt_deep_analysis="""Extract information from a video by analyzing and interpreting its audiovisual elements to provide a detailed description or identify specific data.

You will have specific information to retrieve from the video. Adapt analysis steps to cater for motion, audio, and potential scene changes unique to video content.

# Steps

1. **Parse the Video**: Break down the video into manageable segments, focusing on scenes or timeframes relevant to the target information.
2. **Identify Key Elements**: Within these segments, identify crucial visual and audio elements such as objects, text, dialogue, sounds, and any notable features or contexts.
3. **Interpret Audiovisual Elements**: Determine the significance or purpose of the identified elements. Consider relationships between objects, text recognition, audio cues, and any context provided by the video.
4. **Synthesize Information**: Integrate the interpreted elements to form a coherent understanding or summary.
5. **Verify Details**: Ensure accuracy by cross-referencing identifiable text, icons, or audio snippets with known data or references, if relevant.

# Output Format

The output should be a detailed text description or a structured data response (such as a JSON) containing the identified elements and their interpretations. Each element should be described along with its context or significance within the video.

# Examples

**Input**: (A video of a cooking show with captions and background music.)

**Output**:

Description:
- **Scene**: Cooking demonstration of a pasta dish
- **Captions**: Step-by-step instructions
- **Audio**: Background music, presenter dialogue
- **Visual Elements**: Ingredients and cooking utensils

JSON Example:
```json
{
  "sceneType": "Cooking Demonstration",
  "captions": ["Boil water", "Add pasta"],
  "audio": {
    "backgroundMusic": "light jazz",
    "dialogue": ["Today we are making pasta..."]
  },
  "visualElements": ["pasta", "saucepan", "spoon"]
}
```

# Notes

- Consider using video timestamp and scene identification for accurate element referencing.
- Evaluate both visual and audio elements for context comprehension.
- Ensure that video dynamics like scene changes or motion are accounted for in the synthesis of information."""
|
|
|
|
|
|