Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	Commit · 6671679
	1 Parent(s): 7f23ebc
								
Solved loading transcripts
Browse files
- Data/yt_transcript.py +31 -12
    	
        Data/yt_transcript.py
    CHANGED
    
    | @@ -56,29 +56,48 @@ def fetch_yt_transcript(video_ids): | |
| 56 |  | 
| 57 |  | 
| 58 | 
             
            def all_video_transcript_pipeline():
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 59 | 
             
                video_links_list, new_video_added, new_videos_link = video_links_main()
         | 
| 60 | 
             
                video_transcripts = {}
         | 
| 61 |  | 
| 62 | 
            -
                #  | 
| 63 | 
            -
                transcripts_folder = "transcripts"
         | 
| 64 | 
             
                if os.path.exists(transcripts_folder):
         | 
| 65 | 
             
                    existing_files = os.listdir(transcripts_folder)
         | 
|  | |
|  | |
| 66 | 
             
                    for file in existing_files:
         | 
| 67 | 
            -
                         | 
| 68 | 
            -
             | 
| 69 | 
            -
                             | 
| 70 | 
            -
             | 
| 71 | 
            -
             | 
| 72 | 
            -
             | 
| 73 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 74 |  | 
| 75 | 
             
                # Then fetch new transcripts if there are any
         | 
| 76 | 
            -
                if new_video_added:
         | 
| 77 | 
            -
                    print("New videos  | 
| 78 | 
            -
                     | 
|  | |
| 79 | 
             
                    # Merge new transcripts with existing ones
         | 
| 80 | 
             
                    video_transcripts.update(new_transcripts)
         | 
|  | |
| 81 |  | 
|  | |
| 82 | 
             
                return video_transcripts
         | 
| 83 |  | 
| 84 |  | 
|  | |
| 56 |  | 
| 57 |  | 
| 58 | 
             
            def all_video_transcript_pipeline():
         | 
| 59 | 
            +
                # Get the Data directory path
         | 
| 60 | 
            +
                CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
         | 
| 61 | 
            +
                transcripts_folder = os.path.join(CURRENT_DIR, "transcripts")
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                print(f"Looking for transcripts in: {transcripts_folder}")
         | 
| 64 | 
             
                video_links_list, new_video_added, new_videos_link = video_links_main()
         | 
| 65 | 
             
                video_transcripts = {}
         | 
| 66 |  | 
| 67 | 
            +
                # Always load existing transcripts
         | 
|  | |
| 68 | 
             
                if os.path.exists(transcripts_folder):
         | 
| 69 | 
             
                    existing_files = os.listdir(transcripts_folder)
         | 
| 70 | 
            +
                    print(f"Found {len(existing_files)} files in transcripts folder")
         | 
| 71 | 
            +
             | 
| 72 | 
             
                    for file in existing_files:
         | 
| 73 | 
            +
                        if file.endswith('.txt'):  # Make sure we only process text files
         | 
| 74 | 
            +
                            video_id = file.split("_")[0]
         | 
| 75 | 
            +
                            file_path = os.path.join(transcripts_folder, file)
         | 
| 76 | 
            +
                            try:
         | 
| 77 | 
            +
                                with open(file_path, "r", encoding="utf-8") as f:
         | 
| 78 | 
            +
                                    transcript_text = f.read().splitlines()
         | 
| 79 | 
            +
                                video_transcripts[video_id] = {
         | 
| 80 | 
            +
                                    'text': transcript_text,
         | 
| 81 | 
            +
                                    'file_path': file_path
         | 
| 82 | 
            +
                                }
         | 
| 83 | 
            +
                                print(f"Loaded transcript for video: {video_id}")
         | 
| 84 | 
            +
                            except Exception as e:
         | 
| 85 | 
            +
                                print(f"Error loading transcript {file}: {e}")
         | 
| 86 | 
            +
                else:
         | 
| 87 | 
            +
                    print(f"Transcripts folder not found at: {transcripts_folder}")
         | 
| 88 | 
            +
                    os.makedirs(transcripts_folder)
         | 
| 89 | 
            +
                    print(f"Created transcripts folder at: {transcripts_folder}")
         | 
| 90 |  | 
| 91 | 
             
                # Then fetch new transcripts if there are any
         | 
| 92 | 
            +
                if new_video_added and new_videos_link:
         | 
| 93 | 
            +
                    print("New videos have been added... Fetching transcripts for new videos")
         | 
| 94 | 
            +
                    new_video_ids = [url.split("v=")[1] for url in new_videos_link]  # Extract video IDs
         | 
| 95 | 
            +
                    new_transcripts = fetch_yt_transcript(new_video_ids)
         | 
| 96 | 
             
                    # Merge new transcripts with existing ones
         | 
| 97 | 
             
                    video_transcripts.update(new_transcripts)
         | 
| 98 | 
            +
                    print(f"Added {len(new_transcripts)} new transcripts")
         | 
| 99 |  | 
| 100 | 
            +
                print(f"Total transcripts loaded: {len(video_transcripts)}")
         | 
| 101 | 
             
                return video_transcripts
         | 
| 102 |  | 
| 103 |  |