|
from smolagents import DuckDuckGoSearchTool, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool, PythonInterpreterTool, tool |
|
|
|
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor, pipeline |
|
from qwen_vl_utils import process_vision_info |
|
import torch |
|
|
|
from typing import List, Any, Optional |
|
from markdownify import markdownify |
|
from tavily import TavilyClient |
|
|
|
import os |
|
import uuid |
|
import json |
|
import traceback |
|
import requests |
|
import datetime |
|
import yt_dlp |
|
import pandas as pd |
|
import wikipedia as wiki |
|
from bs4 import BeautifulSoup |
|
|
|
import requests |
|
from bs4 import BeautifulSoup |
|
from markdownify import markdownify as md |
|
|
|
|
|
@tool
def video_analyzer(file_path: str, query: str) -> str:
    """
    An artificial intelligence tool that takes as input a text string containing
    the absolute path to a video file in MP4 format and a string with
    a detailed text query to analyze the video.

    Args:
        file_path: Absolute path to a video file in MP4 format.
        query: Detailed text query to analyze the video.

    Returns:
        str: Row of text with the results of video file analysis

    Examples:
        >>> video_analyzer("/test/1.mp4", "Identify separate bird species. What is the highest number of bird species to be on camera simultaneously?")
        The video shows a group of Emperor penguins and a single Albatross. Therefore, the highest number of bird species to be on camera simultaneously is 2.
    """
    model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

    # The model and processor are loaded on every call.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id, torch_dtype="auto", device_map="auto"
    )
    processor = AutoProcessor.from_pretrained(model_id)

    prompt = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant. " + query

    # Single user turn: the video (with a requested sampling rate of 1 fps) followed by the prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": f"file://{file_path}", "fps": 1.0},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    chat_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[chat_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the device chosen by device_map="auto" instead of hard-coding "cuda",
    # so the tool also works on machines without a GPU.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so that only the newly generated answer is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0]
|
|
|
|
|
|
|
@tool
def wikipedia_available_titles(query: str) -> List[str]:
    """This instrument returns the titles of the articles available on wikipedia.

    If the search fails, the error is printed and an empty list is returned.

    Args:
        query: The query that will be used to search for articles on wikipedia.

    Returns:
        list : list of strings with available article titles
    """
    try:
        wiki.set_rate_limiting(rate_limit=True, min_wait=datetime.timedelta(milliseconds=100))
        titles = wiki.search(query)
    except Exception as e:
        print("Exception occurred: ", e, "with query: ", query)
        # Without this fallback `titles` would be unbound and the return
        # below would raise UnboundLocalError, hiding the real error.
        titles = []

    return titles
|
|
|
|
|
@tool
def wikipedia_summary(title: str) -> str:
    """Fetch the summary of a wikipedia article by its title.

    A failed lookup is reported on stdout and yields an empty string.

    Args:
        title: The title of the wikipedia article to summarize.

    Returns:
        str : The summary of the article.
    """
    min_wait = datetime.timedelta(milliseconds=100)
    try:
        wiki.set_rate_limiting(rate_limit=True, min_wait=min_wait)
        return wiki.summary(title)
    except Exception as err:
        print("Exception occurred: ", err, "with title: ", title)
        return ""
|
|
|
|
|
@tool
def reverse_text(text: str) -> str:
    """Return the given text with its characters in reverse order.

    Args:
        text: The line of text to be reversed

    Returns:
        str : Reversed line of text.

    Examples:
        >>> reverse_text("ecnetnes siht dnatsrednu uoy fI")
        If you understand this sentence
    """
    # Walk the characters back to front and glue them together again.
    backwards = "".join(reversed(text))
    return backwards
|
|
|
|
|
# Tavily API key, read once at import time; None when the variable is not set.
tavily_access_token = os.environ.get("TAVILY_ACCESS_TOKEN")
|
|
|
|
|
@tool
def tavily_search(request: str) -> str:
    """
    This is the ultimate tool for finding information on the internet.
    Don't use it to search YouTube! It's useless!

    Args:
        request: A string containing a query to search in the Internet.

    Returns:
        str: JSON string with execution results containing the following fields:
            - query: The search query to execute with Tavily.
            - answer: A short answer to the user's query, generated by an LLM. Included in the response only if include_answer is requested
            - images: List of query-related images. If include_image_descriptions is true, each item will have url and description.
            - results: A list of sorted search results, ranked by relevancy. Contains the following fields:
                - title: The title of the search result.
                - url: The URL of the search result.
                - content: A short description of the search result.
                - score: The relevance score of the search result.
                - raw_content: The cleaned and parsed HTML content of the search result. Only if include_raw_content is true.
    """
    client = TavilyClient(tavily_access_token)
    response = client.search(
        query=request,
        include_raw_content=False,
        max_results=3,
        search_depth='advanced',
    )

    # The signature and docstring promise a JSON string, so serialize the
    # response instead of handing back the raw object.
    return json.dumps(response, ensure_ascii=False)
|
|
|
@tool
def tavily_extract_web_page(url: str) -> str:
    """
    This is the ultimate tool that allows you to retrieve the contents of a web page.
    In other words, to view the website. Don't use it to extract YouTube pages! It's useless!

    Args:
        url: The URL of the web page from which you want to retrieve information.

    Returns:
        str: The parsed and cleaned HTML content of the web page. The raw content extracted. If nothing could be extracted, a short message describing the failure is returned instead.
    """
    client = TavilyClient(tavily_access_token)
    response = client.extract([url], extract_depth="advanced")

    results = response.get("results") or []
    if not results:
        # Indexing an empty result list would raise IndexError; report the
        # failure as text instead.
        # NOTE(review): "failed_results" is assumed to be where Tavily reports
        # per-URL errors — confirm against the SDK response schema.
        failed = response.get("failed_results") or []
        reason = failed[0] if failed else "no content returned"
        return f"Failed to extract content from {url}: {reason}"

    return results[0]['raw_content']
|
|
|
|
|
@tool
def download_youtube_video_audio(url: str) -> tuple[bool, str, str]:
    """
    Downloads a YouTube video to the ./downloads directory. Video and audio are downloaded separately.
    The video is saved with an .mp4 file name and the audio with an .mp3 file name.

    Args:
        url: The URL of the YouTube video.

    Returns:
        Returns three values:
            bool: Execution result. True - success, False - error in file download process.
            str: The absolute path to the downloaded video file (None on error).
            str: The absolute path to the downloaded audio file (None on error).
    """
    output_dir = "./downloads"
    abs_output_dir = os.path.abspath(output_dir)

    # The paths are computed before the try block so that the cleanup code in
    # the except handler can always refer to them.
    guid = str(uuid.uuid4())
    video_path = os.path.join(abs_output_dir, f"{guid}.mp4")
    audio_path = os.path.join(abs_output_dir, f"{guid}.mp3")

    # Prefer 360p mp4, then anything smaller in mp4, then the smallest stream
    # that is at least 360p.
    format_priority = (
        'bestvideo[height=360][ext=mp4]/'
        'bestvideo[height<360][ext=mp4]/'
        'worstvideo[height>=360]'
    )

    video_options = {
        'format': format_priority,
        'outtmpl': video_path,
        'quiet': True,
        'no_warnings': True,
    }

    # NOTE(review): no audio post-processing is configured here, so the file
    # saved under the .mp3 name may not actually be MP3-encoded — confirm.
    audio_options = {
        'format': 'bestaudio/best[ext=mp3]',
        'outtmpl': audio_path,
        'quiet': True,
        'no_warnings': True,
    }

    try:
        os.makedirs(abs_output_dir, exist_ok=True)

        with yt_dlp.YoutubeDL(video_options) as ydl:
            ydl.download([url])

        with yt_dlp.YoutubeDL(audio_options) as ydl:
            ydl.download([url])
    except Exception as e:
        print(f"ERROR: failed to download {url}: {e}")

        # Best-effort cleanup of whatever was written before the failure.
        for path in (video_path, audio_path):
            try:
                os.remove(path)
            except OSError:
                # The file was never created or cannot be removed; nothing more to do.
                pass

        return False, None, None

    return True, video_path, audio_path
|
|
|
|
|
@tool
def transcribe_audio_file(path: str) -> str:
    """
    The tool takes as input the absolute path to the mp3 file to be transcribed and returns the English text.

    Args:
        path: Absolute path to an audio file in mp3 format.

    Returns:
        str: A string of transcripts of an audio file in English. None is returned if the transcription fails.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # The Whisper pipeline is built on every call.
    transcribe = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-base",
        chunk_length_s=30,
        batch_size=2,
        device=device,
    )

    try:
        transcription = transcribe(
            path,
            batch_size=8,
            generate_kwargs={"language": "english", "task": "transcribe"},
        )["text"]
    except Exception as e:
        # f-string so that the actual error and path are printed, not the
        # literal "{e}, {path}" placeholders.
        print(f"ERROR: {e}, {path}")
        traceback.print_exc()
        return None

    return transcription
|
|
|
@tool
def get_excel_data(file_path: str) -> str:
    """
    The tool takes as input an absolute path to the Excel file whose contents are to be output and returns a string of text with the contents of the file.

    Args:
        file_path: Absolute path to an Excel file.

    Returns:
        str: A row with the contents of an Excel file
    """
    # The return annotation is `str` because the DataFrame is converted to
    # text before being returned.
    # NOTE(review): str() on a DataFrame presumably abbreviates large tables
    # according to pandas display options — confirm this is acceptable.
    return str(pd.read_excel(file_path))
|
|
|
|
|
@tool
def multiply(a: int, b: int) -> int:
    """Return the product of two numbers.

    Args:
        a: first int
        b: second int
    """
    product = a * b
    return product
|
|
|
@tool
def add(a: int, b: int) -> int:
    """Return the sum of two numbers.

    Args:
        a: first int
        b: second int
    """
    total = a + b
    return total
|
|
|
@tool
def subtract(a: int, b: int) -> int:
    """Return the difference of two numbers (a minus b).

    Args:
        a: first int
        b: second int
    """
    difference = a - b
    return difference
|
|
|
@tool
def divide(a: int, b: int) -> float:
    """Divide two numbers.

    Args:
        a: first int
        b: second int

    Returns:
        float: The quotient a / b (true division, so the result is a float).

    Raises:
        ValueError: If b is zero.
    """
    if b == 0:
        raise ValueError("Cannot divide by zero.")
    # `/` is true division, which is why the return annotation is float.
    return a / b
|
|
|
@tool
def modulus(a: int, b: int) -> int:
    """Return the remainder of dividing the first number by the second.

    Args:
        a: first int
        b: second int
    """
    remainder = a % b
    return remainder
|
|
|
|
|
# Every tool handed to the agent. The groups are concatenated left to right,
# so the resulting list keeps the same element order as a flat literal.
available_tools = (
    # Text and arithmetic helpers.
    [reverse_text, multiply, add, subtract, divide, modulus]
    # Media and file handling.
    + [download_youtube_video_audio, transcribe_audio_file, get_excel_data]
    # Wikipedia lookups and video question answering.
    + [wikipedia_available_titles, wikipedia_summary, video_analyzer]
    # Built-in smolagents tools.
    + [FinalAnswerTool(), DuckDuckGoSearchTool()]
    # Tavily-backed web search and page extraction.
    + [tavily_search, tavily_extract_web_page]
    # Built-in Python interpreter.
    + [PythonInterpreterTool()]
)
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: run the video analyzer on one previously downloaded clip.
    sample_video = "/workspaces/Final_Assignment_Template/downloads/60cc887f-cb60-4fc6-88c8-a8bbc6a4659a.mp4"
    question = "Identify separate bird species. What is the highest number of bird species to be on camera simultaneously?"

    answer = video_analyzer(sample_video, question)
    print(answer)