# NOTE: the "Spaces: / Sleeping / Sleeping" lines below this file's original
# header were Hugging Face Spaces page chrome captured during extraction,
# not part of the source; preserved here as a comment.
from smolagents import Tool, tool | |
from youtube_transcript_api import YouTubeTranscriptApi | |
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
import torch | |
def fetch_webpage(url: str, convert_to_markdown: bool = True) -> str:
    """
    Visit a website / URL and fetch the content of the webpage.

    If markdown conversion is enabled, script and style tags are removed and
    the text content is returned as Markdown; otherwise the raw, unfiltered
    HTML is returned.

    Args:
        url (str): The URL to fetch.
        convert_to_markdown (bool): If True, convert the HTML content to
            Markdown format; otherwise return the raw HTML.

    Returns:
        str: The page content as Markdown or raw HTML.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    import requests
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md

    # Browser-like User-Agent: many sites reject requests with the default
    # python-requests agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, timeout=30, headers=headers)
    # Fail fast on HTTP errors instead of silently converting an error page.
    response.raise_for_status()

    if not convert_to_markdown:
        return response.text

    soup = BeautifulSoup(response.text, "html.parser")
    # Remove script and style tags.
    for tag in soup(["script", "style"]):
        tag.extract()

    if "wikipedia.org" in url:
        _strip_wikipedia_boilerplate(soup)
        # For Wikipedia, only keep the main article content when present.
        main_content = soup.find("main", {"id": "content"})
        # Convert the *cleaned* soup (not the raw response text) so the
        # boilerplate removal above is not discarded in the fallback case.
        target = main_content if main_content else soup
        return md(str(target), strip=['script', 'style'], heading_style="ATX").strip()

    # Generic fallback for all other sites.
    return md(str(soup), strip=['script', 'style'], heading_style="ATX").strip()


def _strip_wikipedia_boilerplate(soup) -> None:
    """Remove Wikipedia navigation/reference boilerplate from *soup* in place."""
    elements_to_remove = [
        # Navigation and reference elements (removed outright).
        {'class': 'navbox'},
        {'class': 'navbox-group'},
        {'class': 'reflist'},
        {'class': 'navigation-box'},
        {'class': 'sister-project'},
        {'class': 'metadata'},
        {'class': 'interlanguage-link'},
        {'class': 'catlinks'},
        # Section anchors: the heading and its whole section are removed.
        {'id': 'References'},
        {'id': 'External_links'},
        {'id': 'Further_reading'},
        {'id': 'See_also'},
        {'id': 'Notes'},
    ]
    for selector in elements_to_remove:
        for element in soup.find_all(attrs=selector):
            if 'id' in selector:
                # For ID-based matches, remove the enclosing heading section.
                parent = element.parent
                if parent and parent.name in ['h2', 'h3', 'h4']:
                    # Remove the heading plus all content until the next heading.
                    current = parent
                    while current and current.next_sibling:
                        next_elem = current.next_sibling
                        if (hasattr(next_elem, 'name') and
                            next_elem.name in ['h2', 'h3', 'h4']):
                            break
                        if hasattr(next_elem, 'decompose'):
                            next_elem.decompose()
                        else:
                            current = next_elem
                    parent.decompose()
            else:
                element.decompose()
def read_file_tool(file_path: str) -> str:
    """
    Read a text file and return its contents.

    Args:
        file_path (str): Path to the file to read.

    Returns:
        str: The file's content, or an error message if reading failed.
    """
    try:
        with open(file_path, "r") as fh:
            content = fh.read()
    except Exception as exc:
        # Tool contract: report failures as a string rather than raising,
        # so the calling agent can observe and react to the error.
        return f"Error reading file: {str(exc)}"
    return content
def get_youtube_transcript(video_id: str) -> str:
    """
    Fetches the transcript of a YouTube video given its video ID.

    Args:
        video_id (str): The ID of the YouTube video. Pass in the video ID, NOT
            the video URL. For a video with the URL
            https://www.youtube.com/watch?v=12345 the ID is 12345.

    Returns:
        str: The transcript of the YouTube video, as a single string with each
        line separated by a newline character.
    """
    api = YouTubeTranscriptApi()
    # Raw data has the form
    # [{'text': 'Hey there', 'start': 0.0, 'duration': 1.54}, ...];
    # only the 'text' field of each snippet is kept, one per line.
    snippets = api.fetch(video_id).to_raw_data()
    return "\n".join(snippet['text'] for snippet in snippets)
def transcribe_audio(audio_path: str) -> str:
    """
    Speech to Text - transcribes an audio file and returns the text.

    Args:
        audio_path (str): Local file path to the audio.

    Returns:
        str: The transcript of the audio file.
    """
    # Prefer GPU with fp16 when available; fall back to CPU with fp32.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-small"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        chunk_length_s=30,  # chunked long-form decoding for audio longer than 30s
    )
    result = pipe(audio_path)
    # The ASR pipeline returns a dict of the form {"text": "..."}; the declared
    # contract of this tool is a plain string, so extract the text field.
    return result["text"]