# Scraped page residue (GitHub commit header), kept as comments so the file stays valid Python:
# navodit17's picture
# clean wiki, stt, model upgrade
# 100ac43
from smolagents import Tool, tool
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
@tool
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Visit a website / url and fetch the content of the webpage.

    If markdown conversion is enabled, script and style tags are removed and the
    text content is returned as Markdown; otherwise the raw, unfiltered HTML is
    returned. Wikipedia pages get additional cleanup: navigation boxes,
    reference lists and boilerplate sections are stripped and only the main
    article content is kept.

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to Markdown format. else return the raw HTML.

    Returns:
        str: The HTML content of the URL.
    """
    import requests
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    # Present a regular browser UA: some sites block the default requests UA.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, timeout=30, headers=headers)
    # Fail loudly on 4xx/5xx instead of silently converting an error page.
    response.raise_for_status()

    if not convert_to_markdown:
        return response.text

    soup = BeautifulSoup(response.text, "html.parser")
    # remove script and style tags
    for script in soup(["script", "style"]):
        script.extract()

    if "wikipedia.org" not in url:
        # Generic site: convert the cleaned soup as-is.
        return md(str(soup), strip=['script', 'style'], heading_style="ATX").strip()

    # Wikipedia only: keep the main content, dropping boilerplate.
    elements_to_remove = [
        # Navigation and reference elements
        {'class': 'navbox'},
        {'class': 'navbox-group'},
        {'class': 'reflist'},
        {'class': 'navigation-box'},
        {'class': 'sister-project'},
        {'class': 'metadata'},
        {'class': 'interlanguage-link'},
        {'class': 'catlinks'},
        {'id': 'References'},
        {'id': 'External_links'},
        {'id': 'Further_reading'},
        {'id': 'See_also'},
        {'id': 'Notes'},
    ]
    for selector in elements_to_remove:
        elements = soup.find_all(attrs=selector)
        for element in elements:
            # For ID-based elements, remove the parent section
            if 'id' in selector:
                parent = element.parent
                if parent and parent.name in ['h2', 'h3', 'h4']:
                    # Remove heading and all content until next heading
                    current = parent
                    while current and current.next_sibling:
                        next_elem = current.next_sibling
                        if (hasattr(next_elem, 'name') and
                            next_elem.name in ['h2', 'h3', 'h4']):
                            break
                        if hasattr(next_elem, 'decompose'):
                            next_elem.decompose()
                        else:
                            current = next_elem
                    parent.decompose()
            else:
                element.decompose()

    main_content = soup.find("main", {"id": "content"})
    if main_content:
        return md(str(main_content), strip=['script', 'style'], heading_style="ATX").strip()
    # Fallback: convert the cleaned soup (not the raw response text, which
    # would discard all the cleanup performed above).
    return md(str(soup), strip=['script', 'style'], heading_style="ATX").strip()
@tool
def read_file_tool(file_path: str) -> str:
    """
    Tool to read a file and return its content.

    Args:
        file_path (str): Path to the file to read.

    Returns:
        str: Content of the file or error message.
    """
    try:
        # Pin the encoding: the platform default (e.g. cp1252 on Windows)
        # would make reads inconsistent across machines.
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except Exception as e:
        # Deliberate best-effort: report the failure as text so the agent
        # can react to it instead of crashing.
        return f"Error reading file: {str(e)}"
@tool
def get_youtube_transcript(video_id: str) -> str:
    """
    Fetches the transcript of a YouTube video given its video ID.

    Args:
        video_id (str): The ID of the YouTube video. Pass in the video ID, NOT the video URL. For a video with the URL https://www.youtube.com/watch?v=12345 the ID is 12345.

    Returns:
        str: The transcript of the YouTube video. as a single string with each line separated by a newline character.
    """
    api = YouTubeTranscriptApi()
    # Each raw entry looks like {'text': 'Hey there', 'start': 0.0, 'duration': 1.54};
    # only the text field is kept, one snippet per line.
    snippets = api.fetch(video_id).to_raw_data()
    return "\n".join(entry['text'] for entry in snippets)
@tool
def transcribe_audio(audio_path: str) -> str:
    """
    Speech to Text - transcribes audio file and returns the text

    Args:
        audio_path (str): Local file path to the audio

    Returns:
        str: The transcript of the audio file
    """
    # Use GPU + fp16 when available, otherwise CPU + fp32.
    use_cuda = torch.cuda.is_available()
    device = "cuda:0" if use_cuda else "cpu"
    torch_dtype = torch.float16 if use_cuda else torch.float32

    model_id = "openai/whisper-small"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        # Chunk long audio so files beyond Whisper's 30s window are handled.
        chunk_length_s=30,
    )
    result = pipe(audio_path)
    # The ASR pipeline returns a dict like {"text": ...}; return only the
    # transcript string as the signature promises (the original returned
    # the whole dict).
    return result["text"]