|
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.

    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    # NOTE(review): f-string interpolation breaks the XPath if `text` contains a
    # single quote; XPath 1.0 has no escape character, so such inputs would need
    # concat() handling — flagged rather than silently fixed.
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if nth_result < 1:
        # Guard: a non-positive occurrence number would otherwise index from the
        # end of the list (e.g. elements[-1]) and silently return the wrong match.
        raise Exception(f"Match n°{nth_result} not found (occurrence numbers start at 1)")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
    result = f"Found {len(elements)} matches for '{text}'."
    elem = elements[nth_result - 1]
    # Scroll the chosen match into the viewport so follow-up screenshots show it.
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result
|
|
|
|
|
@tool
def go_back() -> None:
    """Goes back to previous page."""
    # Delegates to Selenium's browser-history navigation (equivalent to the
    # browser's Back button) on the module-level `driver`.
    driver.back()
|
|
|
|
|
@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
    """
    # Sends a single ESC keypress to the page; most modal dialogs close on
    # Escape, but banners that don't listen for it (e.g. cookie consent) won't.
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
|
|
|
def save_screenshot(step_log: ActionStep, agent: CodeAgent) -> None:
    """Step callback: attach a browser screenshot and the current URL to the step log.

    Args:
        step_log: The log entry for the step that just finished.
        agent: The running agent; its log history is scanned to drop stale screenshots.
    """
    sleep(1.0)  # let any JavaScript animations settle before capturing
    driver = helium.get_driver()
    current_step = step_log.step_number
    if driver is not None:
        # Keep screenshots only for the two most recent steps: drop images from
        # older steps to limit memory/token usage.
        # BUGFIX: the condition previously tested the callback argument
        # `step_log` instead of the loop variable, so the age filter never
        # applied to the entries actually being cleared.
        for previous_step in agent.logs:
            if isinstance(previous_step, ActionStep) and previous_step.step_number <= current_step - 2:
                previous_step.observations_images = None
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        # Store a copy so the PIL image stays valid independently of `image`.
        step_log.observations_images = [image.copy()]

    # NOTE(review): this assumes `driver` is not None here; if helium has no
    # active driver, `driver.current_url` would raise — confirm callers always
    # run this after the browser is started.
    url_info = f"Current url: {driver.current_url}"
    # BUGFIX: previously read `step_logs.observations` (the leaked loop
    # variable) instead of the current step's observations.
    step_log.observations = url_info if step_log.observations is None else step_log.observations + "\n" + url_info
    return
|
|
|
from smolagents import CodeAgent, OpenAIServerModel, DuckDuckGoSearchTool

# Chat model backing the agent; requires an OPENAI_API_KEY in the environment.
model = OpenAIServerModel(model_id="gpt-4o")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Browser-automation agent: web search plus the Selenium-backed tools above.
agent = CodeAgent(
    tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],
    model=model,
    # Allow generated code to drive the browser via helium.
    additional_authorized_imports=["helium"],
    # Capture a screenshot + current URL after every step.
    step_callbacks=[save_screenshot],
    max_steps=20,
    verbosity_level=2,
)
|
|
|
# System-prompt text for single-image analysis (OCR, object/text identification).
# Runtime string — content must not be edited casually; it is sent to the model verbatim.
prompt_analysis="""Extract information from an image by analyzing and interpreting its visual elements to provide a detailed description or identify specific data.

# Steps

1. **Analyze the Image**: Identify key elements such as objects, text, colors, and any notable features or contexts.
2. **Interpret Visual Elements**: Determine the significance or purpose of the elements identified. Consider relationships between objects, text recognition if applicable, and any context clues.
3. **Synthesize Information**: Bring together the interpreted elements to form a coherent understanding or summary.
4. **Verify Details**: Ensure accuracy by cross-referencing identifiable text or icons with known data or references, if relevant.

# Output Format

The output should be a detailed text description or a structured data response (such as a JSON) containing the identified elements and their interpretations. Each identified element should be clearly described along with its context or significance.

# Example

**Input**: (An image with a storefront displaying 'Bakery' sign and a variety of bread on display.)

**Output**:

Description:
- **Storefront**: A bakery
- **Signage Text**: "Bakery"
- **Products**: Various types of bread

JSON Example:
```json
{
  "storeType": "Bakery",
  "signText": "Bakery",
  "products": ["bread", "baguette", "pastry"]
}
```

# Notes

- Consider optical character recognition (OCR) for text extraction.
- Evaluate colors and objects for brand or function associations.
- Provide a holistic overview rather than disjointed elements when possible."""
|
|
|
|
|
# System-prompt text for video analysis (scenes, captions, audio cues).
# Runtime string — content must not be edited casually; it is sent to the model verbatim.
prompt_deep_analysis="""Extract information from a video by analyzing and interpreting its audiovisual elements to provide a detailed description or identify specific data.

You will have specific information to retrieve from the video. Adapt analysis steps to cater for motion, audio, and potential scene changes unique to video content.

# Steps

1. **Parse the Video**: Break down the video into manageable segments, focusing on scenes or timeframes relevant to the target information.
2. **Identify Key Elements**: Within these segments, identify crucial visual and audio elements such as objects, text, dialogue, sounds, and any notable features or contexts.
3. **Interpret Audiovisual Elements**: Determine the significance or purpose of the identified elements. Consider relationships between objects, text recognition, audio cues, and any context provided by the video.
4. **Synthesize Information**: Integrate the interpreted elements to form a coherent understanding or summary.
5. **Verify Details**: Ensure accuracy by cross-referencing identifiable text, icons, or audio snippets with known data or references, if relevant.

# Output Format

The output should be a detailed text description or a structured data response (such as a JSON) containing the identified elements and their interpretations. Each element should be described along with its context or significance within the video.

# Examples

**Input**: (A video of a cooking show with captions and background music.)

**Output**:

Description:
- **Scene**: Cooking demonstration of a pasta dish
- **Captions**: Step-by-step instructions
- **Audio**: Background music, presenter dialogue
- **Visual Elements**: Ingredients and cooking utensils

JSON Example:
```json
{
  "sceneType": "Cooking Demonstration",
  "captions": ["Boil water", "Add pasta"],
  "audio": {
    "backgroundMusic": "light jazz",
    "dialogue": ["Today we are making pasta..."]
  },
  "visualElements": ["pasta", "saucepan", "spoon"]
}
```

# Notes

- Consider using video timestamp and scene identification for accurate element referencing.
- Evaluate both visual and audio elements for context comprehension.
- Ensure that video dynamics like scene changes or motion are accounted for in the synthesis of information."""
|
|
|
|
|
|