Spaces:
Runtime error
Runtime error
| import time | |
| from dataclasses import asdict | |
| import pandas as pd | |
| import wandb | |
| from langchain.document_loaders import YoutubeLoader | |
| from pytube import Playlist, YouTube | |
| from tqdm import tqdm | |
| from config import config | |
def retry_access_yt_object(url, max_retries=5, interval_secs=5):
    """
    Create a YouTube object for ``url`` and read its title, retrying on failure.

    Each attempt constructs ``YouTube(url)`` and accesses ``.title``. If either
    step raises, a message is printed and the attempt is repeated after
    ``interval_secs`` seconds, up to ``max_retries`` attempts in total.

    Args:
        url: URL of the video to open.
        max_retries: Total number of attempts to make; must be at least 1.
        interval_secs: Seconds to wait between two consecutive attempts.

    Returns:
        The YouTube object from the first attempt whose title could be read.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1 (no attempt could be made).
        Exception: The exception raised by the last attempt when every attempt fails.
    """
    if max_retries < 1:
        # Without this guard the loop never runs and the function would end in
        # ``raise None``, which surfaces as an unrelated TypeError.
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    last_exception = None
    for attempt in range(1, max_retries + 1):
        try:
            yt = YouTube(url)
            # Read the title so that a failure to fetch it is retried here
            # instead of surfacing later when the caller reads it.
            _ = yt.title
            return yt
        except Exception as err:
            last_exception = err  # Remember the most recent failure for the final raise.
            print(
                f"Failed to create YouTube object or access title. Retrying... ({attempt}/{max_retries})"
            )
            # Only wait when another attempt will follow; sleeping after the
            # final failure would just delay the error.
            if attempt < max_retries:
                time.sleep(interval_secs)

    # Every attempt failed: propagate the last error to the caller.
    raise last_exception
def _scrape_video(video_url):
    """
    Build the dataset record for a single video.

    Args:
        video_url: URL of the video to scrape.

    Returns:
        A dict with the keys ``title``, ``url``, ``duration``, ``publish_date``
        (formatted as ``YYYY-MM-DD``), ``transcript`` (whitespace-normalised)
        and ``total_words``.

    Raises:
        Exception: Whatever the metadata lookup or the transcript loader raises;
            the caller decides whether to skip the video.
    """
    yt = retry_access_yt_object(video_url, max_retries=25, interval_secs=2)
    record = {
        "title": yt.title,
        "url": video_url,
        "duration": yt.length,
        "publish_date": yt.publish_date.strftime("%Y-%m-%d"),
    }

    loader = YoutubeLoader.from_youtube_url(video_url)
    # Split once: the word list gives both the normalised transcript
    # (single spaces between words) and the word count.
    words = loader.load()[0].page_content.split()
    record["transcript"] = " ".join(words)
    record["total_words"] = len(words)
    return record


def main():
    """
    Scrape every video of the configured playlist and log the result to W&B.

    The scraped records are written to ``yt_podcast_transcript.csv`` under
    ``config.root_data_dir``, uploaded as a dataset artifact and logged as a
    table. Videos that fail to scrape are reported and skipped.
    """
    # Using the run as a context manager finishes it even when a later step raises.
    with wandb.init(
        project=config.project_name, job_type="dataset", config=asdict(config)
    ) as run:
        playlist = Playlist(config.playlist_url)
        playlist_video_urls = playlist.video_urls
        print(f"There are total {len(playlist_video_urls)} videos in the playlist.")

        video_data = []
        for video in tqdm(playlist_video_urls, total=len(playlist_video_urls)):
            try:
                video_data.append(_scrape_video(video))
            except Exception as err:
                # Best effort: one bad video must not abort the whole scrape.
                # ``Exception`` (not a bare ``except``) lets Ctrl-C and
                # SystemExit still stop the script.
                print(f"Failed to scrape {video}: {err!r}")
        print(f"Total podcast episodes scraped: {len(video_data)}")

        # save the scraped data to a csv file
        df = pd.DataFrame(video_data)
        data_path = config.root_data_dir / "yt_podcast_transcript.csv"
        df.to_csv(data_path, index=False)

        # upload the scraped data to wandb
        artifact = wandb.Artifact("yt_podcast_transcript", type="dataset")
        artifact.add_file(data_path)
        run.log_artifact(artifact)

        # create wandb table
        table = wandb.Table(dataframe=df)
        run.log({"yt_podcast_transcript": table})


if __name__ == "__main__":
    main()